% !TEX root = main.tex \section{D-Cliques: Creating Locally Representative Cliques} \label{section:d-cliques} In this section, we present the design of D-Cliques. To give an intuition of our approach, let us consider the neighborhood of a single node in a grid similar to that of Figure~\ref{fig:grid-IID-vs-non-IID}, represented in Figure~\ref{fig:grid-iid-vs-non-iid-neighbourhood}. The colors of a node represent the different classes present in its local dataset. In the IID setting (Figure~\ref{fig:grid-iid-neighbourhood}), each node has examples of all classes in equal proportions. In the non-IID setting (Figure~\ref{fig:grid-non-iid-neighbourhood}), each node has examples of only a single class and nodes are distributed randomly in the grid. A single training step, from the point of view of the center node, is equivalent to sampling a mini-batch five times larger from the union of the local distributions of all illustrated nodes. In the IID case, since gradients are computed from examples of all classes, the resulting averaged gradient points in a direction that tends to reduce the loss across all classes. In contrast, in the non-IID case, only a subset of classes are represented in the immediate neighborhood of the node, so the gradients are biased towards these classes. Importantly, since the distributed averaging algorithm takes several steps to converge, this variance persists across iterations, as the locally computed gradients remain far from the global average.\footnote{It is possible, but very costly, to mitigate this by performing a sufficiently large number of averaging steps between each gradient step.} This can significantly slow down convergence, to the point of making decentralized optimization impractical.
\begin{figure}[t] \centering \begin{subfigure}[b]{0.18\textwidth} \centering \includegraphics[width=\textwidth]{../figures/grid-iid-neighbourhood} \caption{\label{fig:grid-iid-neighbourhood} IID} \end{subfigure} \begin{subfigure}[b]{0.18\textwidth} \centering \includegraphics[width=\textwidth]{../figures/grid-non-iid-neighbourhood} \caption{\label{fig:grid-non-iid-neighbourhood} Non-IID} \end{subfigure} \caption{Neighborhood in an IID and non-IID grid.} \label{fig:grid-iid-vs-non-iid-neighbourhood} \end{figure}
In D-Cliques, we address the issue of non-IIDness by carefully designing a network topology composed of \textit{cliques} and \textit{inter-clique connections}. First, D-Cliques recovers a balanced representation of classes, close to that of the IID case, by constructing a topology in which each node $i \in N$ is part of a \textit{clique} $C$ such that the clique distribution $D_C = \bigcup\limits_{\substack{i \in C}} D_i$ is close to the global distribution $D = \bigcup\limits_{\substack{i \in N}} D_i$. We measure the closeness of $D_C$ to $D$ using its \textit{skew}, i.e., the sum over all classes $l \in L$ of the absolute differences between the probability that a sample $(x,y)$ has label $l$ under $D_C$ and under $D$: \begin{equation} \label{eq:skew} \begin{split} \textit{skew}(C) =\ \sum\limits_{\substack{l \in L}} | p(y = l~|(x,y) \in D_C) - \\ p(y = l~|(x,y) \in D) | \end{split} \end{equation} Second, to ensure global consensus and convergence, \textit{inter-clique connections} are introduced by connecting a small number of node pairs that are part of different cliques.
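As a concrete illustration of the skew measure in Eq.~\ref{eq:skew}, the following Python sketch computes the skew of a clique from the labels held by its nodes; the function names and data layout are ours and only serve to make the definition concrete.
\begin{verbatim}
from collections import Counter

def label_distribution(labels):
    """Empirical label distribution of a list of labels."""
    counts = Counter(labels)
    total = sum(counts.values())
    return {l: c / total for l, c in counts.items()}

def skew(clique_labels, global_labels):
    """Skew of a clique: L1 distance between the clique's and the
    global empirical label distributions (the measure defined in
    the text)."""
    p_c = label_distribution(clique_labels)
    p_g = label_distribution(global_labels)
    classes = set(p_c) | set(p_g)
    return sum(abs(p_c.get(l, 0.0) - p_g.get(l, 0.0))
               for l in classes)
\end{verbatim}
For instance, a clique holding examples of a single class, when the global distribution is uniform over 10 classes, has a skew of $0.9 + 9 \times 0.1 = 1.8$, while a clique whose label distribution matches the global one has a skew of 0.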
In the following, we introduce up to one inter-clique connection per node such that each clique has exactly one edge with every other clique; see Figure~\ref{fig:d-cliques-figure} for the corresponding D-Cliques network in the case of $n=100$ nodes and $c=10$ classes. We will explore sparser inter-clique topologies in Section~\ref{section:interclique-topologies}. We construct D-Cliques by initializing cliques at random, each with at most $M$ nodes to limit the intra-clique communication costs, and then swapping nodes between pairs of cliques chosen at random, such that each swap decreases the skew of the pair while keeping the size of the cliques constant (see Algorithm~\ref{Algorithm:D-Clique-Construction}). Only swaps that decrease the skew are performed, hence this algorithm can be seen as a form of randomized greedy algorithm. We note that this algorithm only requires knowledge of the label distribution at each node. For the sake of simplicity, we assume that D-Cliques are constructed from the global knowledge of these distributions, which can easily be obtained by decentralized averaging in a pre-processing step.
\begin{algorithm}[h] \caption{D-Cliques Construction: Greedy Swap} \label{Algorithm:D-Clique-Construction} \begin{algorithmic}[1]
\STATE \textbf{Require:} Max clique size $M$, max steps $K$,
\STATE Set of all nodes $N = \{ 1, 2, \dots, n \}$,
\STATE $\textit{skew}(S)$: skew of subset $S \subseteq N$ compared to the global distribution (Eq.~\ref{eq:skew}),
\STATE $\textit{intra}(DC)$: edges within cliques $C \in DC$,
\STATE $\textit{inter}(DC)$: edges between $C_1,C_2 \in DC$ (Sec.~\ref{section:interclique-topologies}),
\STATE $\textit{weights}(E)$: set weights to edges in $E$ (Eq.~\ref{eq:metro}).
\STATE ~~
\STATE $DC \leftarrow []$ \COMMENT{Empty list}
\WHILE {$N \neq \emptyset$}
\STATE $C \leftarrow$ sample $\min(M, |N|)$ nodes from $N$ at random
\STATE $N \leftarrow N \setminus C$; $DC.append(C)$
\ENDWHILE
\FOR{$k \in \{1, \dots, K\}$}
\STATE $C_1,C_2 \leftarrow$ sample 2 from $DC$ at random
\STATE $\textit{swaps} \leftarrow []$
\FOR{$n_1 \in C_1, n_2 \in C_2$}
\STATE $s \leftarrow \textit{skew}(C_1) + \textit{skew}(C_2)$
\STATE $s' \leftarrow \textit{skew}(C_1-n_1+n_2) + \textit{skew}(C_2 -n_2+n_1)$
\IF {$s' < s$}
\STATE \textit{swaps}.append($(n_1, n_2)$)
\ENDIF
\ENDFOR
\IF {\#\textit{swaps} $> 0$}
\STATE $(n_1,n_2) \leftarrow$ sample 1 from $\textit{swaps}$ at random
\STATE $C_1 \leftarrow C_1 - n_1 + n_2$; $C_2 \leftarrow C_2 - n_2 + n_1$
\ENDIF
\ENDFOR
\RETURN $(\textit{weights}(\textit{intra}(DC) \cup \textit{inter}(DC)), DC)$
\end{algorithmic} \end{algorithm}
The key idea of D-Cliques is that because the clique-level distribution $D_C$ is representative of the global distribution $D$, the local models of nodes across cliques remain rather close. Therefore, a sparse inter-clique topology can be used, significantly reducing the total number of edges without slowing down convergence. Furthermore, the degree of each node in the network remains low and balanced across nodes, making the D-Cliques topology very well-suited to decentralized federated learning.
\section{Optimizing with Clique Averaging and Momentum} \label{section:clique-averaging-momentum} In this section, we present Clique Averaging. This feature, when added to D-SGD, removes the bias caused by the inter-clique edges of D-Cliques. We also show how it can be used to successfully implement momentum for non-IID data.
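The bias discussed in the next subsection depends on how the mixing weights are set. Assuming Eq.~\eqref{eq:metro} is the usual Metropolis-Hastings rule $W_{ij} = (1 + \max(\text{deg}(i), \text{deg}(j)))^{-1}$ commonly used with D-SGD (an assumption on our part, consistent with the weights computed below), the following Python sketch, with names of our own choosing, shows how $\textit{weights}$ in Algorithm~\ref{Algorithm:D-Clique-Construction} can be instantiated.
\begin{verbatim}
def metropolis_hastings_weights(edges, nodes):
    """Mixing weights from an undirected edge set (sketch).
    edges: set of frozensets {i, j}; returns a dict W[(j, i)]."""
    degree = {i: sum(1 for e in edges if i in e) for i in nodes}
    W = {}
    for e in edges:
        i, j = tuple(e)
        W[(i, j)] = W[(j, i)] = 1.0 / (1 + max(degree[i], degree[j]))
    for i in nodes:
        # Self-weight: remaining mass so that each node's incoming
        # weights sum to one.
        W[(i, i)] = 1.0 - sum(W[(j, i)] for j in nodes
                              if j != i and (j, i) in W)
    return W
\end{verbatim}
Under this rule, two cliques of 10 nodes joined by a single inter-clique edge yield exactly the $\frac{12}{110}$, $\frac{11}{110}$, $\frac{10}{110}$ and $\frac{1}{11}$ weights discussed next.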
\subsection{Clique Averaging: Debiasing Gradients from Inter-Clique Edges} \label{section:clique-averaging} While limiting the number of inter-clique connections reduces the number of messages traveling on the network, it also introduces its own bias. Figure~\ref{fig:connected-cliques-bias} illustrates the problem in the simple case of two cliques connected by one inter-clique edge (here, between the green node of the left clique and the pink node of the right clique). Let us focus on node A. With weights computed as in \eqref{eq:metro}, node A's self-weight is $\frac{12}{110}$, the weight between A and the green node connected to B is $\frac{10}{110}$, and all other neighbors of A have a weight of $\frac{11}{110}$. Therefore, the gradient at A is biased towards its own class (pink) and against the green class. A similar bias holds for all other nodes without inter-clique edges with respect to their respective classes. For node B, all its edge weights (including its self-weight) are equal to $\frac{1}{11}$. However, the green class is represented twice (once as a clique neighbor and once through the inter-clique edge), while all other classes are represented only once. This biases the gradient towards the green class. The combined effect of these two sources of bias is to increase the variance of the local models across nodes.
\begin{figure}[t] \centering \includegraphics[width=0.3\textwidth]{../figures/connected-cliques-bias} \caption{\label{fig:connected-cliques-bias} Illustrating the bias induced by inter-clique connections (see main text).} \end{figure}
We address this problem by adding \emph{Clique Averaging} to D-SGD (Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD}), which essentially decouples gradient averaging from model averaging. The idea is to use only the gradients of neighbors within the same clique to compute the average gradient, providing an equal representation to all classes. In contrast, all neighbors' models, including those across inter-clique edges, participate in the model averaging step as in the original version.
\begin{algorithm}[t] \caption{D-SGD with Clique Averaging, Node $i$} \label{Algorithm:Clique-Unbiased-D-PSGD} \begin{algorithmic}[1]
\STATE \textbf{Require:} initial model parameters $\theta_i^{(0)}$, learning rate $\gamma$, mixing weights $W$, mini-batch size $m$, number of steps $K$
\FOR{$k = 1,\ldots, K$}
\STATE $S_i^{(k)} \gets \text{mini-batch of $m$ samples drawn from~} D_i$
\STATE $g_i^{(k)} \gets \frac{1}{|\textit{Clique}(i)|}\sum_{j \in \textit{Clique}(i)} \nabla F(\theta_j^{(k-1)}; S_j^{(k)})$
\STATE $\theta_i^{(k-\frac{1}{2})} \gets \theta_i^{(k-1)} - \gamma g_i^{(k)}$
\STATE $\theta_i^{(k)} \gets \sum_{j \in N} W_{ji}^{(k)} \theta_j^{(k-\frac{1}{2})}$
\ENDFOR
\end{algorithmic} \end{algorithm}
\subsection{Implementing Momentum with Clique Averaging} \label{section:momentum} Efficiently training high-capacity models usually requires additional optimization techniques. In particular, momentum~\cite{pmlr-v28-sutskever13} increases the magnitude of the components of the gradient that are shared between several consecutive steps, and is critical for deep convolutional networks like LeNet~\cite{lecun1998gradient,quagmire} to converge quickly. However, a direct application of momentum in a non-IID setting can actually be very detrimental. As illustrated in Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-effect} for the case of LeNet on CIFAR10 with 100 nodes, D-Cliques with momentum even fails to converge.
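For concreteness, the following minimal Python sketch simulates one synchronous step of D-SGD with Clique Averaging over all nodes (Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD} above); the data structures and names are ours, and the sketch only serves to highlight the decoupling of gradient and model averaging, which we extend with momentum next.
\begin{verbatim}
import numpy as np

def clique_averaging_step(theta, W, clique_of, neighbors_of,
                          grad_fn, lr):
    """One synchronous D-SGD step with Clique Averaging (sketch).
    theta[i]: parameters of node i (np.ndarray)
    W[j][i]: mixing weight of j's model in i's average
    clique_of[i]: members of i's clique (including i)
    neighbors_of[i]: all neighbors of i (including i itself)
    grad_fn(j, params): stochastic gradient on node j's mini-batch."""
    # Gradients are averaged over clique members only, so every class
    # of the near-IID clique distribution is equally represented.
    grads = {j: grad_fn(j, theta[j]) for j in theta}
    half = {i: theta[i]
               - lr * np.mean([grads[j] for j in clique_of[i]], axis=0)
            for i in theta}
    # Models are averaged over all neighbors, including inter-clique
    # edges, as in the original D-SGD update.
    return {i: sum(W[j][i] * half[j] for j in neighbors_of[i])
            for i in theta}
\end{verbatim}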
As Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-effect} also shows, not using momentum actually yields faster convergence, but a significant gap remains compared to the case of a single IID node trained with momentum. Clique Averaging (Section~\ref{section:clique-averaging}) allows us to compute an unbiased momentum term from the unbiased average gradient $g_i^{(k)}$ of Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD}, where $m$ denotes the momentum coefficient: \begin{equation} v_i^{(k)} \leftarrow m v_i^{(k-1)} + g_i^{(k)}. \end{equation} It then suffices to modify the original gradient step to use momentum: \begin{equation} \theta_i^{(k-\frac{1}{2})} \leftarrow \theta_i^{(k-1)} - \gamma v_i^{(k)}. \end{equation}
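As an illustration, here is a minimal sketch of the resulting update in the same style as the previous listing; the function and argument names are ours, and the argument \texttt{momentum} corresponds to the coefficient $m$ above.
\begin{verbatim}
def momentum_half_step(theta_i, v_i, g_i, lr, momentum=0.9):
    """Gradient half-step with momentum (sketch).
    g_i: clique-averaged (unbiased) gradient of node i
    v_i: node i's momentum buffer from the previous step
    momentum: the coefficient denoted m in the text."""
    v_i = momentum * v_i + g_i    # v_i^(k) = m v_i^(k-1) + g_i^(k)
    theta_i = theta_i - lr * v_i  # theta_i^(k-1/2) = theta_i^(k-1) - gamma v_i^(k)
    return theta_i, v_i
\end{verbatim}
In the sketch of the previous listing, this function would replace the plain gradient half-step, with one momentum buffer $v_i$ kept per node.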