\label{section:problem}
We consider a set $N = \{1, \dots, n \}$ of $n$ nodes seeking to
collaboratively solve a classification task with $L$ classes. Each node $i$ has access to a local dataset that
follows its own local distribution $D_i$. The goal is to find a global model
$x$ that performs well on the union of the local distributions by minimizing
the average training loss:
\begin{equation}
\min_{x} \frac{1}{n}\sum_{i=1}^{n} \mathds{E}_
{s_i \sim D_i} [F_i(x;s_i)],
\label{eq:dist-optimization-problem}
\end{equation}
where $s_i$ is a data example drawn from $D_i$ and $F_i$ is the loss function
on node $i$. Thus, $\mathds{E}_{s_i \sim D_i} [F_i(x;s_i)]$ denotes the
expected loss of model $x$ on a random example $s_i$ drawn from $D_i$.
To collaboratively solve Problem \eqref{eq:dist-optimization-problem}, each
node can exchange messages with its neighbors in an undirected network graph
$G(N,E)$ where $\{i,j\}\in E$ denotes an edge (communication channel)
between nodes $i$ and $j$.
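For concreteness, the minimal Python sketch below evaluates the objective of
Problem~\eqref{eq:dist-optimization-problem} for a candidate model $x$, with the
expectations replaced by empirical averages over each node's local samples; the
function names and data layout are hypothetical and serve only as an illustration.
\begin{verbatim}
import numpy as np

def global_objective(x, local_datasets, local_loss):
    # Empirical counterpart of the objective above: average over the
    # n nodes of the average loss of model x on each node's samples.
    per_node = [np.mean([local_loss(x, s) for s in samples])
                for samples in local_datasets]   # one entry per node i
    return np.mean(per_node)
\end{verbatim}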
\subsection{Training Algorithm}
In this work, we use the popular Decentralized Stochastic
Gradient Descent (D-SGD) algorithm~\cite{lian2017d-psgd}. As
shown in Algorithm~\ref{Algorithm:D-PSGD},
a single iteration of D-SGD at node $i$ consists of sampling a mini-batch
from its local distribution
$D_i$, updating its local model $x_i$ by taking a stochastic gradient descent
(SGD) step according to the mini-batch, and performing a weighted average of
its local model with those of its
neighbors.
This weighted average is defined by a
mixing matrix $W$, in which $W_{ij}$ corresponds to the weight of
the outgoing connection from node $i$ to $j$ and $W_{ij} = 0$ for $
\{i,j\}\notin
E$. To ensure that the local models converge on average to a stationary
point
of Problem
\eqref{eq:dist-optimization-problem}, $W$
must be doubly
stochastic ($\sum_{j \in N} W_{ij} = 1$ and $\sum_{j \in N} W_{ji} = 1$) and
symmetric, i.e. $W_{ij} = W_{ji}$~\cite{lian2017d-psgd}.
In our experiments, $W$ is given by the standard
Metropolis-Hastings weights~\cite{xiao2004fast} computed from the network
topology $G$, namely:
\begin{equation}
W_{ij} = \begin{cases}
\frac{1}{\max(\text{degree}(i), \text{degree}(j)) + 1} & \text{if}~i \neq
j \text{ and } \{i,j\}\in E,\\
1 - \sum_{j \neq i} W_{ij} & \text{if } i = j, \\
0 & \text{otherwise}.
\end{cases}
\label{eq:metro}
\end{equation}
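As an illustration, the short NumPy sketch below computes these weights from a
boolean adjacency matrix of $G$; the function name and matrix representation are
our own choices and not part of the algorithm's specification.
\begin{verbatim}
import numpy as np

def metropolis_hastings_weights(adjacency):
    # adjacency: (n, n) boolean matrix of the undirected graph G,
    # without self-loops.
    n = adjacency.shape[0]
    degree = adjacency.sum(axis=1)
    W = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j and adjacency[i, j]:
                W[i, j] = 1.0 / (max(degree[i], degree[j]) + 1)
        W[i, i] = 1.0 - W[i, :].sum()   # remaining mass stays on node i
    return W
\end{verbatim}
The resulting matrix is symmetric and doubly stochastic by construction, which can
be verified with \texttt{np.allclose(W, W.T)} and
\texttt{np.allclose(W.sum(axis=1), 1)}.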
\begin{algorithm}[t]
\caption{D-SGD, Node $i$}
\label{Algorithm:D-PSGD}
\begin{algorithmic}[1]
\STATE \textbf{Require:} initial model parameters $x_i^{(0)}$,
learning rate $\gamma$, mixing weights $W$, mini-batch size $m$,
number of steps $K$
\FOR{$k = 1,\ldots, K$}
\STATE $s_i^{(k)} \gets \text{mini-batch sample of size $m$ drawn
from~} D_i$
\STATE $x_i^{(k-\frac{1}{2})} \gets x_i^{(k-1)} - \gamma \nabla F_i(x_i^{(k-1)}; s_i^{(k)})$
\STATE $x_i^{(k)} \gets \sum_{j \in N} W_{ji} x_j^{(k-\frac{1}{2})}$
\ENDFOR
\end{algorithmic}
\end{algorithm}
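To complement Algorithm~\ref{Algorithm:D-PSGD}, the NumPy sketch below performs one
synchronous D-SGD round across all nodes at once, assuming the stochastic gradients
on the current mini-batches are already available as an $n \times d$ array; the
function and variable names are hypothetical.
\begin{verbatim}
import numpy as np

def dsgd_round(models, gradients, W, gamma):
    # models: (n, d) array, row i holds x_i^{(k-1)}.
    # gradients: (n, d) array, row i holds the stochastic gradient of
    # F_i at x_i^{(k-1)} on the mini-batch s_i^{(k)}.
    half_step = models - gamma * gradients     # local SGD step
    # Neighbor averaging: x_i^{(k)} = sum_j W_{ji} x_j^{(k-1/2)}.
    return W.T @ half_step
\end{verbatim}
Since $W$ is symmetric, \texttt{W.T} can equivalently be replaced by \texttt{W}; we
keep the transpose to match the indexing in Algorithm~\ref{Algorithm:D-PSGD}.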