% !TEX root = main.tex

\section{Evaluation}
\label{section:evaluation}

In this section, we first compare D-Cliques to alternative topologies to
confirm the relevance of our main design choices, and evaluate the importance
of Clique Averaging and momentum as well as the tolerance of D-Cliques to
intra-clique connectivity failures. We then evaluate sparser inter-clique
topologies that further reduce the number of inter-clique connections so as
to gracefully scale with the number of nodes. Finally, we measure the cost of
constructing low-skew cliques with Greedy Swap.

\subsection{Experimental Setup}
\label{section:experimental-settings}

Our main goal is to provide a fair comparison of the convergence speed across
different topologies and algorithmic variations, in order to
show that our approach
can remove much of the effect of label distribution skew.

We experiment with two datasets: MNIST~\cite{mnistWebsite} and
CIFAR10~\cite{krizhevsky2009learning}, which both have $L=10$ classes.
For MNIST, we use 45k and 10k examples from the original 60k
training set for training and validation respectively. The remaining 5k
training examples were randomly removed so that all 10 classes are balanced
and the dataset divides evenly across both 100 and 1000 nodes.
We use all 10k examples of
the test set to measure prediction accuracy. For CIFAR10, classes are evenly
balanced: we use 45k/50k images of the original training set for training,
5k/50k for validation, and all 10k examples of the test set for measuring
prediction accuracy.

We use the non-IID partitioning scheme proposed for MNIST in the seminal
Federated Learning paper of McMahan et al.~\cite{mcmahan2016communication},
and apply it to both MNIST and CIFAR10: we sort all training examples by
class, split the sorted list into shards of equal size, and distribute the
shards to nodes at random such that each node receives exactly two shards.
When the number of examples of a class does not divide evenly into shards, as
is the case for MNIST, some shards may contain examples of more than one
class, so nodes may hold examples of up to 4 classes; most nodes, however,
hold examples of only 2 classes. The varying number of classes per node, as
well as the varying distribution of examples within a single node, makes the
task of creating cliques with low skew non-trivial.
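
For concreteness, the following minimal sketch (illustrative, not the exact
code of our simulator) implements this shard-based partitioning;
\texttt{labels} is assumed to hold the class label of each training example:
\begin{verbatim}
import random

def shard_partition(labels, nb_nodes, shards_per_node=2):
    # Sort example indices by class, then cut the sorted list into
    # nb_nodes * shards_per_node shards of equal size
    # (assumes len(labels) divides evenly, as in our setup).
    order = sorted(range(len(labels)), key=lambda i: labels[i])
    nb_shards = nb_nodes * shards_per_node
    shard_size = len(order) // nb_shards
    shards = [order[s * shard_size:(s + 1) * shard_size]
              for s in range(nb_shards)]
    # Assign shards_per_node shards to each node at random.
    random.shuffle(shards)
    return [sum(shards[n * shards_per_node:(n + 1) * shards_per_node], [])
            for n in range(nb_nodes)]
\end{verbatim}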

We
use a logistic regression classifier for MNIST, which
provides up to 92.5\% accuracy in the centralized setting.
For CIFAR10, we use a Group-Normalized variant of LeNet~\cite{quagmire}, a
deep convolutional network which achieves an accuracy of $72.3\%$ in the
centralized setting.
These models are thus reasonably accurate (which is sufficient to
study the effect of the topology) while being sufficiently fast to train in a
fully decentralized setting and simple enough to configure and analyze.
Regarding hyper-parameters, we jointly optimize the learning rate and
mini-batch size on the
validation set for 100 nodes, obtaining respectively $0.1$ and $128$ for
MNIST and $0.002$ and $20$ for CIFAR10.
For CIFAR10, we additionally use a momentum of $0.9$.

We evaluate 100- and 1000-node networks by creating multiple models in memory
and simulating the exchange of messages between nodes.
To abstract away the impact of distributed execution strategies and system
optimization techniques, we report the test accuracy of all nodes (min, max,
average) as a function of the number of times each example of the dataset has
been sampled by a node, i.e., an \textit{epoch}. This is equivalent to the
classic case of a single node sampling the full distribution.
To further make results comparable across different numbers of nodes, we
scale the mini-batch size inversely with the number of nodes, e.g., on MNIST,
128 with 100 nodes vs.\ 13 with 1000 nodes. This ensures the same number of
model updates and averaging steps per epoch, which is important for a fair
comparison.\footnote{Updating and averaging models
after every example can eliminate the impact of label distribution skew. However, the
resulting communication overhead is impractical.}
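
To make the accounting explicit: with $n$ nodes, $m$ training examples and
mini-batch size $b$, each node holds $m/n$ examples and thus performs
\begin{equation*}
\frac{m/n}{b} = \frac{m}{nb}
\end{equation*}
local updates (and averaging steps) per epoch, which stays constant when $b$
is scaled as $1/n$: on MNIST, $\frac{45000}{100 \times 128} \approx
\frac{45000}{1000 \times 13} \approx 3.5$ updates per node per epoch.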

Finally, we compare our results against an ideal baseline: either a
fully-connected network topology with the same number of nodes or a single
IID node. In both cases, the topology has no effect on the optimization, and
for a given number of nodes and mini-batch size the two baselines are in fact
equivalent.
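
To see why, assume a single local step per round and uniform averaging:
starting from a common model $x_t$, each of the $n$ nodes computes a gradient
$g_i$ on its own mini-batch of size $b$, and averaging the resulting models
gives
\begin{equation*}
x_{t+1} = \frac{1}{n}\sum_{i=1}^{n}\left(x_t - \eta g_i\right)
        = x_t - \eta\,\frac{1}{n}\sum_{i=1}^{n} g_i,
\end{equation*}
i.e., exactly one SGD step of a single node with mini-batch size $nb$; all
models therefore remain identical across rounds.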

\subsection{D-Cliques Match the Convergence Speed of Fully-Connected with a Fraction of the Edges}
% From directory 'results-v2':
% MNIST
% python $TOOLS/analyze/filter.py all --dataset:name mnist --topology:name fully-connected d-cliques/greedy-swap --nodes:name 2-shards-uneq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-28-23:16:47-CEST-labostrex117 all/2021-09-28-23:18:49-CEST-labostrex119 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 80 --ymax 92.5 --yaxis test-accuracy --labels 'fully-connected' 'd-cliques (fc) w/ cliq-avg' --save-figure ../mlsys2022style/figures/convergence-speed-mnist-dc-fc-vs-fc-2-shards-per-node.png --linestyles 'solid' 'dashed' --font-size 18
% CIFAR10
% python $TOOLS/analyze/filter.py all --dataset:name cifar10 --topology:name fully-connected d-cliques/greedy-swap --nodes:name 2-shards-eq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-02-18:58:22-CEST-labostrex114 all/2021-10-03-19:53:21-CEST-labostrex117 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 100 --yaxis test-accuracy --labels 'fully-connected w/ mom.' 'd-cliques (fc) w/ c-avg, mom.' --save-figure ../mlsys2022style/figures/convergence-speed-cifar10-dc-fc-vs-fc-2-shards-per-node.png --linestyles 'solid' 'dashed' --legend 'lower right' --font-size 18

\begin{figure}[htbp]
    \centering        
    \begin{subfigure}[b]{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/convergence-speed-mnist-dc-fc-vs-fc-2-shards-per-node}
    \caption{\label{fig:convergence-speed-mnist-dc-fc-vs-fc-2-shards-per-node} MNIST}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/convergence-speed-cifar10-dc-fc-vs-fc-2-shards-per-node}
    \caption{\label{fig:convergence-speed-cifar10-dc-fc-vs-fc-2-shards-per-node} CIFAR10}
    \end{subfigure}
\caption{\label{fig:convergence-speed-dc-vs-fc-2-shards-per-node} Convergence Speed of D-Cliques Constructed with Greedy Swap Compared to Fully-Connected on 100 Nodes (2 shards/node).}
\end{figure}


Figure~\ref{fig:convergence-speed-dc-vs-fc-2-shards-per-node} illustrates the
convergence speed of D-Cliques with $n=100$ nodes on MNIST (with Clique
Averaging) and CIFAR10 (with Clique Averaging and Momentum). The convergence
speed is very close to that of a fully-connected topology, and significantly
better than with a ring or a grid (see
Figure~\ref{fig:iid-vs-non-iid-problem}), while requiring $\approx 90\%$
fewer edges than a fully-connected topology on 100 nodes.
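
The reduction in edges follows directly from the construction: the 10 cliques
of 10 nodes contribute $10 \times \binom{10}{2} = 450$ intra-clique edges and
the fully-connected inter-clique topology adds $\binom{10}{2} = 45$ edges,
hence
\begin{equation*}
1 - \frac{450 + 45}{\binom{100}{2}} = 1 - \frac{495}{4950} = 90\%.
\end{equation*}
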
\subsection{D-Cliques Converge Faster than Random Graphs}
% From directory 'results-v2':
% MNIST
% python $TOOLS/analyze/filter.py all --dataset:name mnist --topology:name random-graph d-cliques/greedy-swap greedy-neighbourhood-swap --nodes:name 2-shards-uneq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-29-03:53:42-CEST-labostrex119 all/2021-09-29-22:17:08-CEST-labostrex118 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 80 --ymax 92.5 --yaxis test-accuracy --labels 'd-cliques (fc) w/o cliq-avg'  'random 10' --save-figure ../mlsys2022style/figures/convergence-mnist-random-vs-d-cliques-2-shards.png --linestyles 'solid' 'dashed' --font-size 18

% CIFAR10
% python $TOOLS/analyze/filter.py all --dataset:name cifar10 --topology:name d-cliques/greedy-swap random-graph --nodes:nb-nodes 100 --algorithm:learning-momentum 0.9  | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-03-23:37:42-CEST-labostrex117 all/2021-10-05-18:38:30-CEST-labostrex115 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 100 --yaxis test-accuracy --labels 'd-cliques (fc) w/o cliq-avg w/o mom.'  'random 10 w/o mom.' --save-figure ../mlsys2022style/figures/convergence-cifar10-random-vs-d-cliques-2-shards.png --linestyles 'solid' 'dashed' --font-size 18

\begin{figure}[htbp]
     \centering     
         \begin{subfigure}[b]{0.23\textwidth}
         \centering
         \includegraphics[width=\textwidth]{figures/convergence-mnist-random-vs-d-cliques-2-shards}
                  \caption{MNIST}
         \end{subfigure}
                 \hfill                      
        \begin{subfigure}[b]{0.23\textwidth}
        \centering
         \includegraphics[width=\textwidth]{figures/convergence-cifar10-random-vs-d-cliques-2-shards}
         \caption{CIFAR10}
     \end{subfigure} 
 \caption{\label{fig:convergence-random-vs-d-cliques-2-shards} Comparison to Random Graph with 10 edges per node \textit{without} Clique Averaging or Momentum (see main text for justification).} 
\end{figure}

\begin{figure}[htbp]
     \centering     
         \begin{subfigure}[b]{0.23\textwidth}
         \centering
         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-linear-comparison-to-non-clustered-topologies}
                  \caption{MNIST}
         \end{subfigure}
                 \hfill                      
        \begin{subfigure}[b]{0.23\textwidth}
        \centering
         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-linear-comparison-to-non-clustered-topologies}
         \caption{CIFAR10}
     \end{subfigure} 
 \caption{\label{fig:d-cliques-comparison-to-non-clustered-topologies} Comparison to Random Graph with 10 edges per node, \textit{with} Clique Averaging (and Momentum) as well as analogous Neighbourhood Averaging for Random Graph \textit{in a more stringent partitioning of 1 class/node}\textsuperscript{*}.} 
\footnotesize\textsuperscript{*}\textit{These results were obtained with a previous version of the simulator but should be consistent with the latest. They will be updated for the final version.}
\end{figure}

We demonstrate the advantages of D-Cliques over alternative sparse topologies
with a similar number of edges. For $n=100$ nodes, we construct a random
topology such that each node has exactly 10 edges, close to the average of
9.9 edges of our D-Cliques topology. Since a random graph has no clique
structure, Clique Averaging does not directly apply;
Figure~\ref{fig:convergence-random-vs-d-cliques-2-shards} therefore compares
both topologies without Clique Averaging or Momentum. On both MNIST and
CIFAR10, D-Cliques converges faster than the random topology.

To assess whether the clique structure matters beyond merely ensuring class
representativity among neighbors, we also implement an analog of Clique
Averaging for random topologies (Neighbourhood Averaging), in which every
node de-biases its gradient based on the class distribution of its neighbors;
since nodes do not form a clique, each node obtains a different average
gradient. Figure~\ref{fig:d-cliques-comparison-to-non-clustered-topologies}
shows the results under a more stringent partitioning with 1 class per node.
For MNIST, a purely random topology has higher variance and lower convergence
speed than D-Cliques (with or without Clique Averaging), while a random
topology in which edges are chosen such that each node has neighbors of all
possible classes performs similarly to D-Cliques without Clique Averaging.
Perhaps surprisingly, de-biasing the gradient in the random topology performs
slightly worse than not de-biasing it. In any case, D-Cliques with Clique
Averaging outperforms all random topologies, showing that the clique
structure has a small but noticeable effect on the average accuracy and
significantly reduces the variance across nodes in this setup.

On the harder CIFAR10 dataset with a deep convolutional network, the
differences are much more dramatic: D-Cliques with Clique Averaging and
momentum turns out to be critical for fast convergence, and all random
topologies fail to converge to a good solution. This confirms that our clique
structure is important to reduce variance across nodes and improve
convergence. The difference with the MNIST experiment seems to be due both to
the use of a higher-capacity model and to the intrinsic characteristics of
the datasets.

\subsection{Cliques Built with Greedy Swap Converge Faster than Random Cliques}

Figure~\ref{fig:convergence-speed-dc-random-vs-dc-gs-2-shards-per-node}
compares D-Cliques topologies whose cliques are constructed with Greedy Swap
against topologies whose cliques are assembled at random, on 100 nodes with 2
shards per node. On both MNIST and CIFAR10, the lower-skew cliques produced
by Greedy Swap converge faster, showing that the quality of clique
construction matters in addition to the clique structure itself.

% From directory 'results-v2':
% MNIST
% python $TOOLS/analyze/filter.py all --dataset:name mnist --topology:name d-cliques/random-cliques d-cliques/greedy-swap --nodes:name 2-shards-uneq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-29-22:12:59-CEST-labostrex114 all/2021-09-28-23:18:49-CEST-labostrex119 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 80 --ymax 92.5 --yaxis test-accuracy --labels 'd-cliques random' 'd-cliques greedy-swap' --save-figure ../mlsys2022style/figures/convergence-speed-mnist-dc-random-vs-dc-gs-2-shards-per-node.png --linestyles 'solid' 'dashed' --font-size 18
% CIFAR10
%  python $TOOLS/analyze/filter.py all --dataset:name cifar10 --topology:name d-cliques/random-cliques d-cliques/greedy-swap --nodes:name 2-shards-eq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-04-21:18:33-CEST-labostrex117 all/2021-10-03-19:53:21-CEST-labostrex117 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 100 --yaxis test-accuracy --labels 'd-cliques random' 'd-cliques greedy-swap' --save-figure ../mlsys2022style/figures/convergence-speed-cifar10-dc-random-vs-dc-gs-2-shards-per-node.png --linestyles 'solid' 'dashed' --font-size 18

\begin{figure}[htbp]
    \centering        
    \begin{subfigure}[b]{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/convergence-speed-mnist-dc-random-vs-dc-gs-2-shards-per-node}
    \caption{\label{fig:convergence-speed-mnist-dc-random-vs-dc-gs-2-shards-per-node} MNIST}
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/convergence-speed-cifar10-dc-random-vs-dc-gs-2-shards-per-node}
    \caption{\label{fig:convergence-speed-cifar10-dc-random-vs-dc-gs-2-shards-per-node} CIFAR10}
    \end{subfigure}
\caption{\label{fig:convergence-speed-dc-random-vs-dc-gs-2-shards-per-node} Convergence Speed of D-Cliques Constructed Randomly vs.\ with Greedy Swap on 100 Nodes (2 shards/node).}
\end{figure}

\subsection{Clique Averaging and Momentum Are Necessary}

% From directory 'results-v2':
% MNIST
% python $TOOLS/analyze/filter.py all --dataset:name mnist --topology:name d-cliques/greedy-swap --nodes:name 2-shards-uneq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-29-03:53:42-CEST-labostrex119 all/2021-09-28-23:18:49-CEST-labostrex119 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 89 --ymax 92.5 --yaxis test-accuracy --labels 'd-cliques w/o c-avg.' 'd-cliques w/ c-avg.' --save-figure ../mlsys2022style/figures/convergence-speed-mnist-dc-no-c-avg-vs-c-avg-2-shards-per-node.png --linestyles 'solid' 'dashed' --font-size 18
\begin{figure}[htbp]
         \includegraphics[width=0.35\textwidth]{figures/convergence-speed-mnist-dc-no-c-avg-vs-c-avg-2-shards-per-node}
\caption{\label{fig:d-clique-mnist-clique-avg} Effect of Clique Averaging on MNIST. Y axis starts at 89.}
\end{figure}

As illustrated in Figure~\ref{fig:d-clique-mnist-clique-avg}, Clique Averaging
significantly reduces the variance of models across nodes and accelerates
convergence, reaching the same level as a fully-connected topology. Clique
Averaging induces a small additional cost, as gradients and models are sent
in two separate rounds of messages; nonetheless, compared to fully connecting
all nodes, the total number of messages is reduced by $\approx 80\%$.
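
The following minimal sketch illustrates one round of D-SGD with Clique
Averaging; the node objects with a \texttt{model} attribute and a
\texttt{local\_gradient()} method are illustrative assumptions (not our
simulator's API), and uniform averaging weights are used for simplicity:
\begin{verbatim}
import numpy as np

def round_with_clique_averaging(node, clique, neighbours, lr):
    # Round 1: exchange gradients within the clique (which includes
    # this node) and average them, so that the local step uses the
    # clique's nearly unbiased gradient instead of the skewed local one.
    avg_grad = np.mean([peer.local_gradient() for peer in clique], axis=0)
    node.model = node.model - lr * avg_grad
    # Round 2: exchange and average models with all neighbours,
    # as in standard D-SGD.
    node.model = np.mean([peer.model for peer in neighbours + [node]],
                         axis=0)
\end{verbatim}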


% CIFAR10
%  python $TOOLS/analyze/filter.py all --dataset:name cifar10 --topology:name d-cliques/greedy-swap --nodes:name 2-shards-eq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% w/o Clique Averaging
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-03-23:37:42-CEST-labostrex117 all/2021-10-04-03:13:46-CEST-labostrex117  --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 100 --yaxis test-accuracy --labels 'd-cliques w/o momentum' 'd-cliques w/ momentum' --save-figure ../mlsys2022style/figures/convergence-speed-cifar10-wo-c-avg-no-mom-vs-mom-2-shards-per-node.png --linestyles 'solid' 'dashed' --font-size 18
% w/ Clique Averaging
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-03-16:10:34-CEST-labostrex117 all/2021-10-03-19:53:21-CEST-labostrex117 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 100 --yaxis test-accuracy --labels 'd-cliques w/o momentum' 'd-cliques w/ momentum' --save-figure ../mlsys2022style/figures/convergence-speed-cifar10-w-c-avg-no-mom-vs-mom-2-shards-per-node.png --linestyles 'solid' 'dashed' --font-size 18

\begin{figure}[htbp]
    \centering        
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/convergence-speed-cifar10-wo-c-avg-no-mom-vs-mom-2-shards-per-node}
    \caption{\label{fig:convergence-speed-cifar10-wo-c-avg-no-mom-vs-mom-2-shards-per-node} Without Clique Averaging }
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
    \includegraphics[width=\textwidth]{figures/convergence-speed-cifar10-w-c-avg-no-mom-vs-mom-2-shards-per-node}
    \caption{\label{fig:convergence-speed-cifar10-w-c-avg-no-mom-vs-mom-2-shards-per-node} With Clique Averaging}
    \end{subfigure}
\caption{\label{fig:cifar10-c-avg-momentum} Effect of Clique Averaging and Momentum on CIFAR10 with LeNet.}
\end{figure}

As shown in Figure~\ref{fig:cifar10-c-avg-momentum}, Clique Averaging
restores the benefits of momentum and closes the gap with the centralized
setting.
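
Intuitively, the momentum buffer accumulates whatever gradient is used for
the local step: with Clique Averaging it accumulates a nearly unbiased
gradient, whereas without it the buffer amplifies the skew of the local
gradient. A minimal sketch, assuming momentum is applied to the
clique-averaged gradient and a per-node buffer \texttt{node.velocity} (an
illustrative name) initialized to zero:
\begin{verbatim}
def momentum_step(node, avg_grad, lr, momentum=0.9):
    # Accumulate the clique-averaged gradient in the momentum buffer,
    # then take the SGD step in the accumulated direction.
    node.velocity = momentum * node.velocity + avg_grad
    node.model = node.model - lr * node.velocity
\end{verbatim}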

\subsection{D-Cliques Tolerate Some Intra-Connectivity Failures}

While the previous experiments suggest that the clique structure is
instrumental in obtaining good performance, one may wonder whether full
intra-clique connectivity is actually necessary.
Figures~\ref{fig:d-cliques-mnist-intra-connectivity}
and~\ref{fig:d-cliques-cifar10-intra-connectivity} show the convergence speed
of a D-Cliques topology in which cliques have been sparsified by randomly
removing 1 or 5 undirected edges per clique (out of 45). Strikingly, both for
MNIST and CIFAR10, removing just a single edge per clique has a significant
effect on convergence speed; on CIFAR10, it can even entirely negate the
benefits of D-Cliques. Overall, these results show that achieving fast
convergence on non-IID data with sparse topologies requires a very careful
design, as we propose with D-Cliques.

% From directory 'results-v2':
% MNIST
% python $TOOLS/analyze/filter.py all --dataset:name mnist --topology:name d-cliques/greedy-swap --nodes:name 2-shards-uneq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% w/o Clique Gradient
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-29-03:53:42-CEST-labostrex119 all/2021-10-01-21:44:14-CEST-labostrex113 all/2021-10-02-06:53:40-CEST-labostrex113 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 89 --ymax 92.5 --yaxis test-accuracy --labels 'full intra-connectivity' '-1 edge/clique' '-5 edges/clique' --save-figure ../mlsys2022style/figures/d-cliques-mnist-wo-clique-avg-impact-of-edge-removal.png --linestyles 'solid' 'dashed' 'dotted' --font-size 18
% w/ Clique Gradient
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-28-23:18:49-CEST-labostrex119 all/2021-10-01-17:08:42-CEST-labostrex113 all/2021-10-02-02:17:43-CEST-labostrex113 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 89 --ymax 92.5 --yaxis test-accuracy  --labels 'full intra-connectivity' '-1 edge/clique' '-5 edges/clique' --save-figure ../mlsys2022style/figures/d-cliques-mnist-w-clique-avg-impact-of-edge-removal.png --linestyles 'solid' 'dashed' 'dotted' --font-size 18
\begin{figure}[htbp]
\begin{subfigure}[htbp]{0.23\textwidth}
     \centering   
         \includegraphics[width=\textwidth]{figures/d-cliques-mnist-wo-clique-avg-impact-of-edge-removal}     
\caption{\label{fig:d-cliques-mnist-wo-clique-avg-impact-of-edge-removal} Without Clique Averaging }
\end{subfigure}
\hfill
\begin{subfigure}[htbp]{0.23\textwidth}
     \centering
         \includegraphics[width=\textwidth]{figures/d-cliques-mnist-w-clique-avg-impact-of-edge-removal}
\caption{\label{fig:d-cliques-mnist-w-clique-avg-impact-of-edge-removal} With Clique Averaging}
\end{subfigure}
\caption{\label{fig:d-cliques-mnist-intra-connectivity} MNIST: Impact of Intra-clique Connectivity Failures. Y axis starts at 89.}
\end{figure}
% CIFAR10
% python $TOOLS/analyze/filter.py all --dataset:name cifar10 --topology:name d-cliques/greedy-swap --nodes:name 2-shards-eq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% w/o Clique Gradient
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-04-03:13:46-CEST-labostrex117 all/2021-10-06-17:58:49-CEST-labostrex112 all/2021-10-06-17:45:22-CEST-labostrex115  --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 80 --yaxis test-accuracy --labels 'full intra-connectivity' '-1 edge/clique' '-5 edges/clique' --save-figure ../mlsys2022style/figures/d-cliques-cifar10-wo-clique-avg-impact-of-edge-removal.png --linestyles 'solid' 'dashed' 'dotted' --font-size 18
% w/ Clique Gradient
% python $TOOLS/analyze/diff.py --rundirs all/2021-10-03-19:53:21-CEST-labostrex117 all/2021-10-06-12:46:49-CEST-labostrex112 all/2021-10-06-12:49:51-CEST-labostrex115 --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 0 --ymax 80 --yaxis test-accuracy --labels 'full intra-connectivity' '-1 edge/clique' '-5 edges/clique' --save-figure ../mlsys2022style/figures/d-cliques-cifar10-w-clique-avg-impact-of-edge-removal.png --linestyles 'solid' 'dashed' 'dotted' --font-size 18

\begin{figure}[htbp]
     \centering
\begin{subfigure}[htbp]{0.23\textwidth}
     \centering   
         \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-wo-clique-avg-impact-of-edge-removal}     
\caption{\label{fig:d-cliques-cifar10-wo-clique-avg-impact-of-edge-removal} Without Clique Averaging }
\end{subfigure}
\hfill
\begin{subfigure}[htbp]{0.23\textwidth}
     \centering
         \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-w-clique-avg-impact-of-edge-removal}
\caption{\label{fig:d-cliques-cifar10-w-clique-avg-impact-of-edge-removal} With Clique Averaging}
\end{subfigure}
\caption{\label{fig:d-cliques-cifar10-intra-connectivity} CIFAR10: Impact of Intra-clique Connectivity Failures (with Momentum).}
\end{figure}

\subsection{D-Cliques Scale with Sparser Inter-Clique Topologies}

In this last series of experiments, we evaluate the effect of sparser
inter-clique topologies on convergence speed for a larger network of 1000
nodes. We compare the scalability and convergence speed of the D-Cliques
variants introduced in Section~\ref{section:interclique-topologies}.

Figure~\ref{fig:d-cliques-cifar10-convolutional} shows the convergence speed
of all sparse inter-clique topologies on MNIST and CIFAR10, compared to the
ideal baseline of a single IID node performing the same number of updates per
epoch (the fastest convergence achievable if the topology had no impact).
Among the linear schemes, the ring topology converges but is much slower than
our fractal scheme. Among the super-linear schemes, the small-world topology
converges almost as fast as a fully-connected inter-clique topology while
using 22\% fewer edges (14.5 edges per node on average instead of 18.9).

While the small-world inter-clique topology shows promising scaling
behaviour, the fully-connected inter-clique topology still offers significant
benefits with 1000 nodes, as it represents a 98\% reduction in the number of
edges compared to fully connecting individual nodes (18.9 edges per node on
average instead of 999) and a 96\% reduction in the number of messages (37.8
messages per round per node on average instead of 999). We refer to
Appendix~\ref{app:scaling} for additional results comparing convergence speed
across different numbers of nodes. Overall, these results show that D-Cliques
scale gracefully with the number of nodes.
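
The average degree of 18.9 follows directly from the construction: the 1000
nodes form 100 cliques of 10 nodes, contributing $100 \times \binom{10}{2} =
4500$ intra-clique edges, while the fully-connected inter-clique topology
adds $\binom{100}{2} = 4950$ edges, hence
\begin{equation*}
\frac{2 \times (4500 + 4950)}{1000} = 18.9
\end{equation*}
edges per node on average; with one message per edge in each direction per
round, this also yields the $2 \times 18.9 = 37.8$ messages per round per
node reported above.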

% From directory 'results-v2':
% MNIST
% python $TOOLS/analyze/filter.py all --dataset:name mnist --topology:name d-cliques/greedy-swap --nodes:name 2-shards-uneq-classes --meta:seed 1 --nodes:nb-nodes 100 | python $TOOLS/analyze/diff.py
% python $TOOLS/analyze/diff.py --rundirs all/2021-09-30-11:39:47-CEST-labostrex114 all/2021-09-30-11:40:32-CEST-labostrex115  --pass-through | python $TOOLS/plot/convergence.py --add-min-max --ymin 84 --ymax 92.5 --yaxis test-accuracy  --labels 'fully-connected' 'd-cliques (fractal)' 'd-cliques (ring)' --save-figure ../mlsys2022style/figures/d-cliques-scaling-mnist-1000-linear.png --linestyles 'solid' 'dashed' 'dotted' --font-size 18

\begin{figure}[t]
     \centering
%       % To regenerate the figure, from directory results/mnist
% % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-09:20:03-CET ../scaling/1000/mnist/fractal-cliques/all/2021-03-14-17:41:59-CET ../scaling/1000/mnist/clique-ring/all/2021-03-13-18:22:36-CET     --add-min-max --yaxis test-accuracy --legend 'lower right' --ymin 84 --ymax 92.5 --labels '1 node IID' 'd-cliques (fractal)' 'd-cliques (ring)'  --save-figure ../../figures/d-cliques-mnist-1000-nodes-comparison-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
%     \begin{subfigure}[b]{0.35\textwidth}
%         \centering
%            \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-1000-nodes-comparison-linear}
%             \caption{\label{fig:d-cliques-mnist-1000-nodes-comparison-linear} MNIST with Linear Model: Linear Inter-clique Topologies.}
%     \end{subfigure}
%     \hfill
%     % To regenerate the figure, from directory results/cifar10
%% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-13:52:58-CET ../scaling/1000/cifar10/fractal-cliques/all/2021-03-14-17:42:46-CET ../scaling/1000/cifar10/clique-ring/all/2021-03-14-09:55:24-CET  --add-min-max --yaxis test-accuracy --labels '1-node IID' 'd-cliques (fractal)' 'd-cliques (ring)' --legend 'lower right' --save-figure ../../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
%     \begin{subfigure}[b]{0.35\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-linear}
%\caption{\label{fig:d-cliques-cifar10-1000-vs-1-node-test-accuracy-linear}  CIFAR10 with LeNet Model: Linear Inter-clique Topologies.}
%     \end{subfigure}
%    
%     
% % To regenerate the figure, from directory results/mnist
% % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-09:20:03-CET ../scaling/1000/mnist/fully-connected-cliques/all/2021-03-14-17:56:26-CET ../scaling/1000/mnist/smallworld-logn-cliques/all/2021-03-23-21:45:39-CET --add-min-max --yaxis test-accuracy --legend 'lower right' --ymin 84 --ymax 92.5 --labels '1 node IID'  'd-cliques (fully-connected cliques)' 'd-cliques (smallworld)'  --save-figure ../../figures/d-cliques-mnist-1000-nodes-comparison-super-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
%     \begin{subfigure}[b]{0.35\textwidth}
%         \centering
%            \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-1000-nodes-comparison-super-linear}
%             \caption{\label{fig:d-cliques-mnist-1000-nodes-comparison-super-linear} MNIST with Linear Model: Superlinear Inter-clique Topologies.}
%     \end{subfigure}
%     \hfill
%     % To regenerate the figure, from directory results/cifar10
%% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-13:52:58-CET ../scaling/1000/cifar10/fully-connected-cliques/all/2021-03-14-17:41:20-CET ../scaling/1000/cifar10/smallworld-logn-cliques/all/2021-03-23-22:13:57-CET  --add-min-max --yaxis test-accuracy --labels '1-node IID' 'd-cliques (fully-connected cliques)' 'd-cliques (smallworld)' --legend 'lower right' --save-figure ../../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-super-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
%     \begin{subfigure}[b]{0.35\textwidth}
%         \centering
%         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-super-linear}
%\caption{\label{fig:d-cliques-cifar10-1000-vs-1-node-test-accuracy-super-linear}  CIFAR10 with LeNet Model: Superlinear Inter-clique Topologies.}
%     \end{subfigure}
TODO
\caption{\label{fig:d-cliques-cifar10-convolutional} D-Cliques Convergence Speed with 1000 Nodes, non-IID, Constant Updates per Epoch, with Different Inter-Clique Topologies.}
\end{figure}

\subsection{Cliques with Low Skew Can Be Constructed Efficiently with Greedy Swap}
\label{section:cost-cliques}

Finally, we measure the quality and construction cost of the cliques produced
by Greedy Swap. Figure~\ref{fig:final-skew-distribution} shows the
distribution of the final skew of cliques with a maximum size of 10, over 100
experiments: on both MNIST and CIFAR10, Greedy Swap consistently produces
cliques with lower skew than random construction.
Figure~\ref{fig:skew-convergence-speed-2-shards} shows how quickly the skew
decreases during construction; as a reference, 1000 steps take less than 6
seconds in Python 3.7 on a MacBook Pro 2020.
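
As a rough illustration (not the exact algorithm of our implementation), the
following sketch captures the core of a greedy swap procedure, assuming the
skew of a clique is measured as the $L_1$ distance between its label
distribution and the global label distribution:
\begin{verbatim}
import random
from collections import Counter

def skew(clique, labels, global_dist):
    # L1 distance between the clique's label distribution and the
    # global one; labels[n] lists the class labels of node n's examples.
    counts = Counter(c for n in clique for c in labels[n])
    total = sum(counts.values())
    return sum(abs(counts[c] / total - p) for c, p in global_dist.items())

def greedy_swap(cliques, labels, global_dist, steps=1000):
    # At each step, pick two cliques at random and apply the single
    # node swap that most reduces their combined skew, if any exists.
    for _ in range(steps):
        i, j = random.sample(range(len(cliques)), 2)
        a, b = cliques[i], cliques[j]
        best = skew(a, labels, global_dist) + skew(b, labels, global_dist)
        best_pair = None
        for x in a:
            for y in b:
                na = [n for n in a if n != x] + [y]
                nb = [n for n in b if n != y] + [x]
                s = (skew(na, labels, global_dist)
                     + skew(nb, labels, global_dist))
                if s < best:
                    best, best_pair = s, (na, nb)
        if best_pair is not None:
            cliques[i], cliques[j] = best_pair
    return cliques
\end{verbatim}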

% MNIST
% python $TOOLS/plot/skew/final-distribution.py --rundirs skews-mnist/* --save-figure ../mlsys2022style/figures/final-skew-distribution-mnist.png --labels 'Greedy Swap' 'Random Cliques' --linewidth 2.5 --font-size 18 --linestyles 'solid' 'dashed'
% CIFAR10
% python $TOOLS/plot/skew/final-distribution.py --rundirs skews-cifar10/* --save-figure ../mlsys2022style/figures/final-skew-distribution-cifar10.png --labels 'Greedy Swap' 'Random Cliques' --linewidth 2.5 --font-size 18 --linestyles 'solid' 'dashed'

\begin{figure}[htbp]
    \centering        
    \begin{subfigure}[b]{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/final-skew-distribution-mnist}
    \caption{\label{fig:final-skew-distribution-mnist} MNIST }
    \end{subfigure}
    \hfill
    \begin{subfigure}[b]{0.23\textwidth}
    \centering
    \includegraphics[width=\textwidth]{figures/final-skew-distribution-cifar10}
    \caption{\label{fig:final-skew-distribution-cifar10} CIFAR10}
    \end{subfigure}
\caption{\label{fig:final-skew-distribution} Final Quality of Cliques (Skew) with a Maximum Size of 10 over 100 Experiments.}
\end{figure}

%python $TOOLS/analyze/filter.py skews --topology:name d-cliques/greedy-swap | python $TOOLS/plot/skew/convergence.py --max-steps 400 --labels 'MNIST (unbalanced classes)' 'CIFAR10 (balanced classes)' --linewidth 2.5 --save-figure ../mlsys2022style/figures/skew-convergence-speed-2-shards.png
\begin{figure}[htbp]
    \centering
    \includegraphics[width=0.3\textwidth]{figures/skew-convergence-speed-2-shards}
    \caption{\label{fig:skew-convergence-speed-2-shards} Speed of Skew Decrease during Clique Construction. The bold line is the average over 100 experiments with 10 cliques per experiment; thin lines are the minimum and maximum over all experiments. As a reference, 1000 steps take less than 6 seconds in Python 3.7 on a MacBook Pro 2020.}
\end{figure}