% !TEX root = main.tex
\section{Problem Setting}
\label{section:problem}

\paragraph{Objective.} We consider a set $N = \{1, \dots, n\}$ of $n$ nodes seeking to collaboratively solve a classification task with $L$ classes. We denote a labeled data point by a tuple $(x,y)$ where $x$ represents the data point (e.g., a feature vector) and $y\in\{1,\dots,L\}$ its label. Each node has access to a local dataset that follows its own local distribution $D_i$, which may differ from that of other nodes. In this work, we tackle label distribution skew: denoting by $p_i(x,y)=p_i(x|y)p_i(y)$ the probability of $(x,y)$ under the local distribution $D_i$ of node $i$, we assume that $p_i(y)$ may vary across nodes. We refer to \cite{kairouz2019advances,quagmire} for concrete examples of problems with label distribution skew. The objective is to find the parameters $\theta$ of a global model that performs well on the union of the local distributions by minimizing the average training loss:
\begin{equation}
\min_{\theta} \frac{1}{n}\sum_{i=1}^{n} \mathds{E}_{(x_i,y_i) \sim D_i} [F_i(\theta;x_i,y_i)],
\label{eq:dist-optimization-problem}
\end{equation}
where $(x_i,y_i)$ is a data point drawn from $D_i$ and $F_i$ is the loss function on node $i$. Therefore, $\mathds{E}_{(x_i,y_i) \sim D_i} [F_i(\theta;x_i,y_i)]$ denotes the expected loss of model $\theta$ over $D_i$. To collaboratively solve Problem~\eqref{eq:dist-optimization-problem}, each node can exchange messages with its neighbors in an undirected network graph $G(N,E)$, where $\{i,j\}\in E$ denotes an edge (communication channel) between nodes $i$ and $j$.

\paragraph{Training algorithm.} In this work, we use the popular Decentralized Stochastic Gradient Descent (D-SGD) algorithm~\cite{lian2017d-psgd}. As shown in Algorithm~\ref{Algorithm:D-PSGD}, a single iteration of D-SGD at node $i$ consists of sampling a mini-batch from its local distribution $D_i$, updating its local model $\theta_i$ by taking a stochastic gradient descent (SGD) step on this mini-batch, and performing a weighted average of its local model with those of its neighbors. This weighted average is defined by a mixing matrix $W$, in which $W_{ij}$ corresponds to the weight of the outgoing connection from node $i$ to $j$, and $W_{ij} = 0$ for $\{i,j\}\notin E$. To ensure that the local models converge on average to a stationary point of Problem~\eqref{eq:dist-optimization-problem}, $W$ must be doubly stochastic ($\sum_{j \in N} W_{ij} = 1$ and $\sum_{j \in N} W_{ji} = 1$) and symmetric, i.e., $W_{ij} = W_{ji}$~\cite{lian2017d-psgd}. In our experiments, $W$ is given by standard Metropolis-Hastings weights~\cite{xiao2004fast} computed from $G$, namely:
\begin{equation}
W_{ij} =
\begin{cases}
\frac{1}{\max(\text{degree}(i), \text{degree}(j)) + 1} & \text{if}~i \neq j \text{ and } \{i,j\}\in E,\\
1 - \sum_{j \neq i} W_{ij} & \text{if } i = j, \\
0 & \text{otherwise}.
\end{cases}
\label{eq:metro}
\end{equation}

\begin{algorithm}[t]
\caption{D-SGD, Node $i$}
\label{Algorithm:D-PSGD}
\begin{algorithmic}[1]
\STATE \textbf{Require:} initial model parameters $\theta_i^{(0)}$, learning rate $\gamma$, mixing weights $W$, mini-batch size $m$, number of steps $K$
\FOR{$k = 1,\ldots, K$}
\STATE $S_i^{(k)} \gets \text{mini-batch of $m$ samples drawn from~} D_i$
\STATE $\theta_i^{(k-\frac{1}{2})} \gets \theta_i^{(k-1)} - \gamma \nabla F_i(\theta_i^{(k-1)}; S_i^{(k)})$
\STATE $\theta_i^{(k)} \gets \sum_{j \in N} W_{ji} \theta_j^{(k-\frac{1}{2})}$
\ENDFOR
\end{algorithmic}
\end{algorithm}
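To make the construction of $W$ concrete, the following is a minimal NumPy sketch (illustrative only, not our actual implementation) that builds the Metropolis-Hastings mixing matrix of Equation~\eqref{eq:metro} from an undirected edge list; the function name \texttt{metropolis\_hastings\_weights} and the graph representation are our own choices for this example.

\begin{verbatim}
import numpy as np

def metropolis_hastings_weights(n, edges):
    """n x n Metropolis-Hastings mixing matrix for an undirected graph."""
    adj = np.zeros((n, n), dtype=bool)
    for i, j in edges:
        adj[i, j] = adj[j, i] = True
    degree = adj.sum(axis=1)  # degree(i) = number of neighbors of node i
    W = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j and adj[i, j]:
                W[i, j] = 1.0 / (max(degree[i], degree[j]) + 1)
    np.fill_diagonal(W, 1.0 - W.sum(axis=1))  # W_ii = 1 - sum_{j != i} W_ij
    return W

# Example: 4-node ring; W is symmetric and doubly stochastic as required.
W = metropolis_hastings_weights(4, [(0, 1), (1, 2), (2, 3), (3, 0)])
assert np.allclose(W, W.T) and np.allclose(W.sum(axis=1), 1.0)
\end{verbatim}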
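Similarly, the sketch below illustrates the update rule of Algorithm~\ref{Algorithm:D-PSGD} as a centralized simulation, assuming each local model is a flat NumPy vector and that the hypothetical helpers \texttt{sample\_batch} and \texttt{grad\_fn} return a mini-batch from $D_i$ and the corresponding stochastic gradient of $F_i$; a real deployment would instead exchange only neighbor models over the edges of $G$.

\begin{verbatim}
import numpy as np

def d_sgd(thetas, W, sample_batch, grad_fn, gamma, num_steps):
    """Simulate the D-SGD update on all nodes; thetas is a list of n vectors."""
    n = len(thetas)
    for k in range(num_steps):
        # Local SGD step on each node's own mini-batch.
        half = [thetas[i] - gamma * grad_fn(i, thetas[i], sample_batch(i))
                for i in range(n)]
        # Weighted average with neighbors; W[j, i] = 0 for non-neighbors,
        # so node i only needs its neighbors' models.
        thetas = [sum(W[j, i] * half[j] for j in range(n)) for i in range(n)]
    return thetas

# Toy usage: quadratic losses F_i(theta) = 0.5 * ||theta - c_i||^2 on a
# 4-node ring, whose average is minimized at the mean of the c_i.
n, d = 4, 3
W = np.array([[1, 1, 0, 1],
              [1, 1, 1, 0],
              [0, 1, 1, 1],
              [1, 0, 1, 1]]) / 3.0  # Metropolis-Hastings weights of the ring
c = np.random.randn(n, d)
thetas = d_sgd([np.zeros(d) for _ in range(n)], W,
               sample_batch=lambda i: None,  # gradients are exact in this toy
               grad_fn=lambda i, theta, batch: theta - c[i],
               gamma=0.1, num_steps=500)
\end{verbatim}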