% !TEX root = main.tex
\section{Problem Setting}
\label{section:problem}

\paragraph{Objective.} We consider a set $N = \{1, \dots, n\}$ of $n$ nodes seeking to collaboratively solve a classification task with $L$ classes. We denote a labeled data point by a tuple $(x,y)$ where $x$ represents the data point (e.g., a feature vector) and $y\in\{1,\dots,L\}$ its label. Each node has access to a local dataset that follows its own local distribution $D_i$, which may differ from that of other nodes. In this work, we tackle label distribution skew: denoting by $p_i(x,y)=p_i(x|y)p_i(y)$ the probability of $(x,y)$ under the local distribution $D_i$ of node $i$, we assume that $p_i(y)$ may vary across nodes. We refer to \cite{kairouz2019advances,quagmire} for concrete examples of problems with label distribution skew. The objective is to find the parameters $\theta$ of a global model that performs well on the union of the local distributions by minimizing the average training loss:
\begin{equation}
\min_{\theta} \frac{1}{n}\sum_{i=1}^{n} \mathds{E}_{(x_i,y_i) \sim D_i} [F_i(\theta;x_i,y_i)],
\label{eq:dist-optimization-problem}
\end{equation}
where $(x_i,y_i)$ is a data point drawn from $D_i$ and $F_i$ is the loss function on node $i$. Therefore, $\mathds{E}_{(x_i,y_i) \sim D_i} [F_i(\theta;x_i,y_i)]$ denotes the expected loss of model $\theta$ over $D_i$. To collaboratively solve Problem~\eqref{eq:dist-optimization-problem}, each node can exchange messages with its neighbors in an undirected network graph $G(N,E)$, where $\{i,j\}\in E$ denotes an edge (communication channel) between nodes $i$ and $j$.

\paragraph{Training algorithm.} In this work, we use the popular Decentralized Stochastic Gradient Descent (D-SGD) algorithm~\cite{lian2017d-psgd}. As shown in Algorithm~\ref{Algorithm:D-PSGD}, a single iteration of D-SGD at node $i$ consists of sampling a mini-batch from its local distribution $D_i$, updating its local model $\theta_i$ by taking a stochastic gradient descent (SGD) step on this mini-batch, and performing a weighted average of its local model with those of its neighbors. This weighted average is defined by a mixing matrix $W$, in which $W_{ij}$ corresponds to the weight of the outgoing connection from node $i$ to $j$, and $W_{ij} = 0$ for $\{i,j\}\notin E$. To ensure that the local models converge on average to a stationary point of Problem~\eqref{eq:dist-optimization-problem}, $W$ must be doubly stochastic ($\sum_{j \in N} W_{ij} = 1$ and $\sum_{j \in N} W_{ji} = 1$) and symmetric, i.e., $W_{ij} = W_{ji}$~\cite{lian2017d-psgd}. In our experiments, $W$ is given by standard Metropolis-Hastings weights~\cite{xiao2004fast} computed from $G$, namely:
\begin{equation}
W_{ij} =
\begin{cases}
\frac{1}{\max(\text{degree}(i), \text{degree}(j)) + 1} & \text{if}~i \neq j \text{ and } \{i,j\}\in E,\\
1 - \sum_{j \neq i} W_{ij} & \text{if } i = j, \\
0 & \text{otherwise}.
\end{cases}
\label{eq:metro}
\end{equation}

\begin{algorithm}[t]
\caption{D-SGD, Node $i$}
\label{Algorithm:D-PSGD}
\begin{algorithmic}[1]
\STATE \textbf{Require:} initial model parameters $\theta_i^{(0)}$, learning rate $\gamma$, mixing weights $W$, mini-batch size $m$, number of steps $K$
\FOR{$k = 1,\ldots, K$}
\STATE $S_i^{(k)} \gets \text{mini-batch of $m$ samples drawn from~} D_i$
\STATE $\theta_i^{(k-\frac{1}{2})} \gets \theta_i^{(k-1)} - \gamma \nabla F_i(\theta_i^{(k-1)}; S_i^{(k)})$
\STATE $\theta_i^{(k)} \gets \sum_{j \in N} W_{ji} \theta_j^{(k-\frac{1}{2})}$
\ENDFOR
\end{algorithmic}
\end{algorithm}
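To make the construction of $W$ concrete, the following is a minimal NumPy sketch (illustrative only, not our actual implementation) that builds the Metropolis-Hastings mixing matrix of Equation~\eqref{eq:metro} from an undirected edge list; the function name \texttt{metropolis\_hastings\_weights} and the graph representation are our own choices for this example.

\begin{verbatim}
import numpy as np

def metropolis_hastings_weights(n, edges):
    """n x n Metropolis-Hastings mixing matrix for an undirected graph."""
    adj = np.zeros((n, n), dtype=bool)
    for i, j in edges:
        adj[i, j] = adj[j, i] = True
    degree = adj.sum(axis=1)  # degree(i) = number of neighbors of node i
    W = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j and adj[i, j]:
                W[i, j] = 1.0 / (max(degree[i], degree[j]) + 1)
    np.fill_diagonal(W, 1.0 - W.sum(axis=1))  # W_ii = 1 - sum_{j != i} W_ij
    return W

# Example: 4-node ring; W is symmetric and doubly stochastic as required.
W = metropolis_hastings_weights(4, [(0, 1), (1, 2), (2, 3), (3, 0)])
assert np.allclose(W, W.T) and np.allclose(W.sum(axis=1), 1.0)
\end{verbatim}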
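Similarly, the sketch below illustrates the update rule of Algorithm~\ref{Algorithm:D-PSGD} as a centralized simulation, assuming each local model is a flat NumPy vector and that the hypothetical helpers \texttt{sample\_batch} and \texttt{grad\_fn} return a mini-batch from $D_i$ and the corresponding stochastic gradient of $F_i$; a real deployment would instead exchange only neighbor models over the edges of $G$.

\begin{verbatim}
import numpy as np

def d_sgd(thetas, W, sample_batch, grad_fn, gamma, num_steps):
    """Simulate the D-SGD update on all nodes; thetas is a list of n vectors."""
    n = len(thetas)
    for k in range(num_steps):
        # Local SGD step on each node's own mini-batch.
        half = [thetas[i] - gamma * grad_fn(i, thetas[i], sample_batch(i))
                for i in range(n)]
        # Weighted average with neighbors; W[j, i] = 0 for non-neighbors,
        # so node i only needs its neighbors' models.
        thetas = [sum(W[j, i] * half[j] for j in range(n)) for i in range(n)]
    return thetas

# Toy usage: quadratic losses F_i(theta) = 0.5 * ||theta - c_i||^2 on a
# 4-node ring, whose average is minimized at the mean of the c_i.
n, d = 4, 3
W = np.array([[1, 1, 0, 1],
              [1, 1, 1, 0],
              [0, 1, 1, 1],
              [1, 0, 1, 1]]) / 3.0  # Metropolis-Hastings weights of the ring
c = np.random.randn(n, d)
thetas = d_sgd([np.zeros(d) for _ in range(n)], W,
               sample_batch=lambda i: None,  # gradients are exact in this toy
               grad_fn=lambda i, theta, batch: theta - c[i],
               gamma=0.1, num_steps=500)
\end{verbatim}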