% !TEX root = main.tex
\section{Problem Setting}
\label{section:problem}

We consider a set $N = \{1, \dots, n\}$ of $n$ nodes seeking to collaboratively solve a classification task with $L$ classes. Each node has access to a local dataset that follows its own local distribution $D_i$. The goal is to find a global model $x$ that performs well on the union of the local distributions by minimizing the average training loss:
\begin{equation}
\min_{x} \frac{1}{n}\sum_{i=1}^{n} \mathds{E}_{s_i \sim D_i} [F_i(x;s_i)],
\label{eq:dist-optimization-problem}
\end{equation}
where $s_i$ is a data example drawn from $D_i$ and $F_i$ is the loss function on node $i$. Thus, $\mathds{E}_{s_i \sim D_i} [F_i(x;s_i)]$ denotes the expected loss of model $x$ on a random example $s_i$ drawn from $D_i$.

To collaboratively solve Problem~\eqref{eq:dist-optimization-problem}, each node can exchange messages with its neighbors in an undirected network graph $G(N,E)$, where $\{i,j\}\in E$ denotes an edge (communication channel) between nodes $i$ and $j$.

\subsection{Training Algorithm}

In this work, we use the popular Decentralized Stochastic Gradient Descent (D-SGD) algorithm~\cite{lian2017d-psgd}. As shown in Algorithm~\ref{Algorithm:D-PSGD}, a single iteration of D-SGD at node $i$ consists of sampling a mini-batch from its local distribution $D_i$, updating its local model $x_i$ by taking a stochastic gradient descent (SGD) step on the mini-batch, and performing a weighted average of its local model with those of its neighbors. This weighted average is defined by a mixing matrix $W$, in which $W_{ij}$ corresponds to the weight of the outgoing connection from node $i$ to $j$, and $W_{ij} = 0$ for $\{i,j\}\notin E$. To ensure that the local models converge on average to a stationary point of Problem~\eqref{eq:dist-optimization-problem}, $W$ must be doubly stochastic ($\sum_{j \in N} W_{ij} = 1$ and $\sum_{j \in N} W_{ji} = 1$ for all $i \in N$) and symmetric, i.e., $W_{ij} = W_{ji}$~\cite{lian2017d-psgd}. In our experiments, $W$ is given by the standard Metropolis-Hastings weights~\cite{xiao2004fast} computed from the network topology $G$, namely:\todo{AB: if we need space we can remove this equation}
\begin{equation}
W_{ij} =
\begin{cases}
  \frac{1}{\max(\text{degree}(i), \text{degree}(j)) + 1} & \text{if}~i \neq j \text{ and } \{i,j\}\in E,\\
  1 - \sum_{j' \neq i} W_{ij'} & \text{if } i = j, \\
  0 & \text{otherwise}.
\end{cases}
\label{eq:metro}
\end{equation}

\begin{algorithm}[t]
  \caption{D-SGD, Node $i$}
  \label{Algorithm:D-PSGD}
  \begin{algorithmic}[1]
    \STATE \textbf{Require:} initial model parameters $x_i^{(0)}$, learning rate $\gamma$, mixing weights $W$, mini-batch size $m$, number of steps $K$
    \FOR{$k = 1,\ldots, K$}
      \STATE $s_i^{(k)} \gets \text{mini-batch sample of size $m$ drawn from~} D_i$
      \STATE $x_i^{(k-\frac{1}{2})} \gets x_i^{(k-1)} - \gamma \nabla F_i(x_i^{(k-1)}; s_i^{(k)})$
      \STATE $x_i^{(k)} \gets \sum_{j \in N} W_{ji} x_j^{(k-\frac{1}{2})}$
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
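
For concreteness, the following minimal sketch (in Python with NumPy; the function name and the adjacency-list data layout are illustrative and not part of our implementation) shows how the Metropolis-Hastings weights of Equation~\eqref{eq:metro} can be computed from the topology $G$, assuming nodes are labeled $0, \dots, n-1$:
\begin{verbatim}
import numpy as np

# Illustrative sketch (not our implementation): build the mixing matrix W
# of the Metropolis-Hastings rule from an adjacency list
# {node: set of neighbors} of an undirected graph with nodes 0..n-1.
def metropolis_hastings_weights(adjacency):
    n = len(adjacency)
    W = np.zeros((n, n))
    for i in range(n):
        for j in adjacency[i]:
            # off-diagonal weight: 1 / (max(degree(i), degree(j)) + 1)
            W[i, j] = 1.0 / (max(len(adjacency[i]), len(adjacency[j])) + 1)
        # diagonal weight: remaining mass so that row i sums to 1
        W[i, i] = 1.0 - W[i].sum()
    return W

# Example: a 4-node ring; the resulting W is symmetric and doubly stochastic.
ring = {0: {1, 3}, 1: {0, 2}, 2: {1, 3}, 3: {0, 2}}
W = metropolis_hastings_weights(ring)
\end{verbatim}
By construction, each row sums to one and the matrix is symmetric, hence doubly stochastic as required above.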
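
Similarly, one iteration of Algorithm~\ref{Algorithm:D-PSGD}, executed here synchronously for all nodes at once, can be sketched as follows; \texttt{sample\_batch}, \texttt{grad}, and \texttt{gamma} are hypothetical placeholders for the local data loader, the stochastic gradient of $F_i$, and the learning rate $\gamma$, and \texttt{W} is the mixing matrix from the previous sketch:
\begin{verbatim}
# Illustrative sketch (not our implementation) of one synchronous D-SGD
# iteration over all nodes. `models` is a list of parameter vectors x_i,
# `W` the (NumPy) mixing matrix, `sample_batch(i)` draws a mini-batch from
# D_i, `grad(i, x, batch)` returns the stochastic gradient of F_i, and
# `gamma` is the learning rate.
def dsgd_iteration(models, W, sample_batch, grad, gamma):
    n = len(models)
    # Local SGD step: x_i^{(k-1/2)} = x_i^{(k-1)} - gamma * grad F_i
    half = [models[i] - gamma * grad(i, models[i], sample_batch(i))
            for i in range(n)]
    # Gossip averaging: x_i^{(k)} = sum_j W_{ji} * x_j^{(k-1/2)}
    return [sum(W[j, i] * half[j] for j in range(n)) for i in range(n)]
\end{verbatim}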