title={{SCAFFOLD: Stochastic Controlled Averaging for On-Device Federated Learning}},
author={Sai Praneeth Karimireddy and Satyen Kale and Mehryar Mohri and Sashank J. Reddi and Sebastian U. Stich and Ananda Theertha Suresh},
booktitle={ICML},
year={2020}
}
@inproceedings{fedprox,
  title     = {{Federated Optimization in Heterogeneous Networks}},
  author    = {Tian Li and Anit Kumar Sahu and Manzil Zaheer and Maziar Sanjabi and Ameet Talwalkar and Virginia Smith},
  booktitle = {MLSys},
  year      = {2020}
}
@inproceedings{quagmire,
  title     = {{The Non-IID Data Quagmire of Decentralized Machine Learning}},
  author    = {Kevin Hsieh and Amar Phanishayee and Onur Mutlu and Phillip B. Gibbons},
  booktitle = {ICML},
  year      = {2020}
}
@inproceedings{mcmahan2016communication,
  title     = {Communication-efficient learning of deep networks from decentralized data},
  author    = {McMahan, H. Brendan and Moore, Eider and Ramage, Daniel and Hampson, Seth and {Ag{\"u}era y Arcas}, Blaise},
  booktitle = {AISTATS},
  year      = {2017}
}
@inproceedings{neglia2020,
  title     = {Decentralized gradient methods: does topology matter?},
  author    = {Giovanni Neglia and Chuan Xu and Don Towsley and Gianmarco Calbi},
  booktitle = {AISTATS},
  year      = {2020}
}
@techreport{amp_dec,
  title       = {{Privacy Amplification by Decentralization}},
  author      = {Edwige Cyffers and Aurélien Bellet},
  year        = {2020},
  institution = {arXiv:2012.05326}
}
@article{Duchi2012a,
  author        = {John C. Duchi and Alekh Agarwal and Martin J. Wainwright},
  title         = {{Dual Averaging for Distributed Optimization: Convergence Analysis and Network Scaling}},
  journal       = {{IEEE} Transactions on Automatic Control},
  volume        = {57},
  number        = {3},
  pages         = {592--606},
  year          = {2012},
  keywords      = {optimization, distributed},
  owner         = {aurelien},
  timestamp     = {2013.09.16},
  date-modified = {2014-10-30 15:23:27 +0000}
}
@article{Nedic18,
  author  = {Angelia Nedić and Alex Olshevsky and Michael G. Rabbat},
  title   = {{Network Topology and Communication-Computation Tradeoffs in Decentralized Optimization}},
  journal = {Proceedings of the IEEE},
  volume  = {106},
  number  = {5},
  pages   = {953--976},
  year    = {2018}
}
@techreport{kairouz2019advances,
  title       = {{Advances and Open Problems in Federated Learning}},
  author      = {Peter Kairouz and H. Brendan McMahan and Brendan Avent and Aurélien Bellet and Mehdi Bennis and Arjun Nitin Bhagoji and Keith Bonawitz and Zachary Charles and Graham Cormode and Rachel Cummings and Rafael G. L. D'Oliveira and Salim El Rouayheb and David Evans and Josh Gardner and Zachary Garrett and Adrià Gascón and Badih Ghazi and Phillip B. Gibbons and Marco Gruteser and Zaid Harchaoui and Chaoyang He and Lie He and Zhouyuan Huo and Ben Hutchinson and Justin Hsu and Martin Jaggi and Tara Javidi and Gauri Joshi and Mikhail Khodak and Jakub Konečný and Aleksandra Korolova and Farinaz Koushanfar and Sanmi Koyejo and Tancrède Lepoint and Yang Liu and Prateek Mittal and Mehryar Mohri and Richard Nock and Ayfer Özgür and Rasmus Pagh and Mariana Raykova and Hang Qi and Daniel Ramage and Ramesh Raskar and Dawn Song and Weikang Song and Sebastian U. Stich and Ziteng Sun and Ananda Theertha Suresh and Florian Tramèr and Praneeth Vepakomma and Jianyu Wang and Li Xiong and Zheng Xu and Qiang Yang and Felix X. Yu and Han Yu and Sen Zhao},
  year        = {2019},
  institution = {arXiv:1912.04977}
}
@article{tibshirani1996regression,
  title   = {Regression shrinkage and selection via the lasso},
  author  = {Tibshirani, Robert},
  journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
  volume  = {58},
  number  = {1},
  pages   = {267--288},
  year    = {1996}
}
% 1/ Decentralized FL approaches can be more scalable than Centralized FL approach when the number of nodes is large
% 2/ It is well known the topology can affect convergence of decentralized algorithms, as shown by classic convergence analysis. However the effect of topology has been observed to be often quite small in practice. This is because most of these results were obtained for iid data.
% 3/ In this paper, we show that the effect of topology is very significant for non-iid data. Unlike for centralized FL approaches, this happens even when nodes perform a single local update before averaging. We propose an approach to design a sparse data-aware topology which recovers the convergence speed of a centralized approach.
% 4/ An originality of our approach is to work at the topology level without changing the original efficient and simple D-SGD algorithm. Other work to mitigate the effect of non-iid on decentralized algorithms are based on performing modified updates (eg with variance reduction) or multiple averaging steps.
Machine learning is currently shifting from the classic \emph{centralized}
paradigm, in which models are trained on data located on a single machine or
in a data center, to more \emph{decentralized} ones.
Indeed, data is often inherently decentralized as it is collected by several
parties (such as different hospitals, companies, personal devices...).
Federated Learning (FL) allows a set
of data owners to collaboratively train machine learning models
on their joint
data while keeping it decentralized, thereby avoiding the costs of moving
data as well as mitigating privacy and confidentiality concerns
\cite{kairouz2019advances}.
Due to the decentralized nature of data collection, the local datasets of
participants can be very different in size and distribution: they are
\emph{not} independent and identically distributed
(non-IID). In particular, the class distributions may vary a lot
across local datasets \cite{quagmire}.
Therefore, one of the key challenges in FL is to design algorithms that
can cope with such non-IID data distributions.
Federated learning algorithms can be classified into two categories depending
on the network topology they work on. In server-based FL, the network is
organized as a star: a central server orchestrates the training process and
iteratively aggregates model updates received from the participants and sends
them back the aggregated model \cite{mcmahan2016communication}. In contrast,
fully decentralized FL algorithms operate over an arbitrary topology where
participants communicate in a peer-to-peer fashion with their direct neighbors
in the network graph. A classic example of such algorithms is Decentralized
SGD (D-SGD) \cite{lian2017d-psgd}, in which participants alternate between
local SGD updates and model averaging with neighboring nodes.
In this work, we focus on fully decentralized algorithms as they can
generally
scale better to the large number of participants seen in ``cross-device''
applications \cite{kairouz2019advances}. Indeed, while a central
server quickly becomes a bottleneck as the number of participants increases, the topology used in fully decentralized algorithms can remain sparse
enough such that all participants have small (constant or logarithmic) degree
\cite{lian2017d-psgd}. Recent work has shown both empirically
\cite{lian2017d-psgd} and theoretically \cite{neglia2020} that sparse
topologies like rings or grids do not significantly affect the convergence
rate compared to using denser topologies when data is IID.
% We also note that full decentralization can also provide benefits in terms of
% privacy protection \cite{amp_dec}.
In contrast to the IID case, we show in this work that \emph{the impact of
topology is very significant for non-IID data}. This phenomenon is illustrated
in Figure~\ref{fig:iid-vs-non-iid-problem}, where we see that using a ring or
grid topology completely jeopardizes the convergence rate in practice when
classes are imbalanced across participants.
We stress that, unlike for centralized FL approaches
\cite{kairouz2019advances,scaffold,quagmire}, this
happens even when nodes perform a single local update before averaging the
model with their neighbors. We thus study the following question:
% \textit{Are there regular topologies, i.e. where all nodes have similar or the same number of neighbours, with less connections than a fully-connected graph that retain a similar convergence speed and non-IID behaviour?}
\textit{Are there sparse topologies that allow a similar convergence
speed as the fully connected graph under a large number of participants with
non-IID class distribution?}
We answer this question in the affirmative by proposing \textsc{D-Cliques}, an approach to
design \emph{a sparse data-aware topology which allows us to recover the convergence
speed of a centralized (or IID) approach}. Our proposal includes a simple
modification of the standard D-SGD algorithm which ensures that gradients are
unbiased with respect to the class distribution.
We empirically evaluate our approach on MNIST and CIFAR10 datasets using
logistic
regression and deep convolutional models with up to 1000 participants. This is
in contrast to most previous work on fully decentralized algorithms which only
consider a few tens of participants \cite{refs}.
\aurelien{TODO: complete above paragraph with more details and highlighting
other contributions as needed}
To summarize, our contributions are as follows:
\begin{enumerate}
\item TODO
\item
\item
\end{enumerate}
The rest of this paper is organized as follows. \aurelien{TO COMPLETE}
\begin{figure}
\centering
...
...
% TODO: Short verbal introduction to D-PSGD
% Superseded by the caption below; kept commented out to avoid a duplicate \caption in this figure.
% \caption{IID vs non-IID Convergence Speed. Thin lines are the minimum and maximum accuracy of individual nodes. Bold lines are the average accuracy across all nodes.\protect\footnotemark}
\caption{IID vs non-IID Convergence Speed. Thin lines are the minimum
and maximum accuracy of individual nodes. Bold lines are the average
accuracy across all nodes.\protect\footnotemark\aurelien{TODO: make
the figure more self-contained (adding all information to
understand the setting) and perhaps add a fig to show the
effect of D-Cliques}}
\label{fig:iid-vs-non-iid-problem}
\end{figure}
\footnotetext{This is different from the accuracy of the average model across nodes that is sometimes used once training is completed.}
% Earlier phrasing of the research question, superseded by the version in the introduction.
% \textit{Are there regular topologies, i.e. where all nodes have similar or the same number of neighbours, with less connections than a fully-connected graph that retain a similar convergence speed and non-IID behaviour?}
\subsection{Bias in Gradient Averaging with Non-IID Data}
\aurelien{I think this should go into the approach section, to motivate it.
In the introduction, maybe we can just give the main intuitions in a few
sentences?}
To have a preliminary intuition of the impact of non-IID data on convergence speed, examine the local neighbourhood of a single node in a grid similar to that used to obtain results in Figure~\ref{fig:grid-IID-vs-non-IID}, as illustrated in Figure~\ref{fig:grid-iid-vs-non-iid-neighbourhood}. The color of a node, represented as a circle, corresponds to one of the 10 available classes in the dataset. In the IID setting (Figure~\ref{fig:grid-iid-neighbourhood}), each node has examples of all ten classes in equal proportions. In the (rather extreme) non-IID case (Figure~\ref{fig:grid-non-iid-neighbourhood}), each node has examples of only a single class and nodes are distributed randomly in the grid, with neighbourhoods such as this one sometimes having nodes with examples of the same class adjacent to each other.
\begin{figure}
...
...
% However, in the (rather extreme) non-IID case illustrated, there are not enough
\subsection{D-Cliques}
\aurelien{this should definitely go to approach section}
% In addition, it is important that all nodes are initialized with the same model
\section{Related Work}
% Duplicate of the D2 discussion placed later in this section; kept commented out.
% D2: numerically unstable when $W_{ij}$ rows and columns do not exactly sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbour contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
\aurelien{not sure yet if it is better to have this section here or earlier,
we'll see}
% where to place TornadoAggregate and related refs?
\paragraph{Impact of topology in fully decentralized FL.} It is well
known
that the choice of network topology can affect the
convergence of fully decentralized algorithms: this is typically accounted
for
in the theoretical convergence rate by a dependence on the spectral gap of the
network, see for instance \cite{Duchi2012a,lian2017d-psgd,Nedic18}.
% mention Neglia and empirical results for IID data, probably also Consensus
% Control paper which does not allow to analyze the effect of topology.
% can mention Marfoq paper on topology design but to optimize network
% resources, independent of data
% conclusion: role of topology in non-IID is not understood / has not
% been much studied before our work.
\paragraph{Dealing with non-IID data in server-based FL.}
% scaffold, quagmire, fedprox, etc
% also personalized models: Smith etc
\paragraph{Dealing with non-IID data in fully decentralized FL.}
% non-IID known to be a problem for fully decentralized FL. cf Jelasity paper
% D2 and other recent papers on modifying updates: Quasi-Global Momentum,
% Cross-Gradient Aggregation
% papers using multiple averaging steps
% also our personalized papers
D2: numerically unstable when $W_{ij}$ rows and columns do not exactly sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
An originality of our approach is to work at the topology level
without significantly changing the original simple and efficient D-SGD
algorithm \cite{lian2017d-psgd}. Other works that mitigate the effect of non-IID
data on decentralized algorithms are based on performing modified updates (e.g.,
with variance reduction) or multiple averaging steps.