Commit 223f059a authored by aurelien.bellet

continue related work

parent 8bc441bd
@inproceedings{smith2017federated,
title={{Federated Multi-Task Learning}},
author={Smith, Virginia and Chiang, Chao-Kai and Sanjabi, Maziar and Talwalkar, Ameet S.},
booktitle={NIPS},
year={2017}
}
@inproceedings{perso_fl_mean,
title={{Lower Bounds and Optimal Algorithms for Personalized Federated Learning}},
author={Filip Hanzely and Slavomír Hanzely and Samuel Horváth and Peter Richtárik},
booktitle={NeurIPS},
year={2020}
}
@inproceedings{maml,
title={{Personalized Federated Learning with Theoretical Guarantees: A Model-Agnostic Meta-Learning Approach}},
author={Alireza Fallah and Aryan Mokhtari and Asuman Ozdaglar},
booktitle={NeurIPS},
year={2020}
}
@inproceedings{moreau,
title={{Personalized Federated Learning with Moreau Envelopes}},
author={Canh T. Dinh and Nguyen H. Tran and Tuan Dung Nguyen},
booktitle={NeurIPS},
year={2020}
}
@techreport{momentum_noniid,
title={{Quasi-Global Momentum: Accelerating Decentralized Deep Learning on Heterogeneous Data}},
author={Tao Lin and Sai Praneeth Karimireddy and Sebastian U. Stich and Martin Jaggi},
year={2021},
institution = {arXiv:2102.04761}
}
@techreport{cross_gradient,
title={{Cross-Gradient Aggregation for Decentralized Learning from Non-IID data}},
author={Yasaman Esfandiari and Sin Yong Tan and Zhanhong Jiang and Aditya Balu and Ethan Herron and Chinmay Hegde and Soumik Sarkar},
year={2021},
institution = {arXiv:2103.02051}
}
@techreport{consensus_distance,
title={{Consensus Control for Decentralized Deep Learning}},
author={Lingjing Kong and Tao Lin and Anastasia Koloskova and Martin Jaggi and Sebastian U. Stich},
year={2021},
institution = {arXiv:2102.04828}
}
@INPROCEEDINGS{Colin2016a,
author = {Igor Colin and Aur\'elien Bellet and Joseph Salmon and St\'ephan Cl\'emen\c{c}on},
title = {{G}ossip {D}ual {A}veraging for {D}ecentralized {O}ptimization of {P}airwise {F}unctions},
booktitle = {{ICML}},
year = {2016}
}
@inproceedings{scaffold,
title={{SCAFFOLD: Stochastic Controlled Averaging for On-Device Federated Learning}},
author={Sai Praneeth Karimireddy and Satyen Kale and Mehryar Mohri and Sashank J. Reddi and Sebastian U. Stich and Ananda Theertha Suresh},
...@@ -5,6 +61,13 @@
year={2020}
}
@inproceedings{marfoq,
title={{Throughput-Optimal Topology Design for Cross-Silo Federated Learning}},
author={Othmane Marfoq and Chuan Xu and Giovanni Neglia and Richard Vidal},
booktitle={NeurIPS},
year={2020}
}
@inproceedings{Lian2018,
Author = {Xiangru Lian and Wei Zhang and Ce Zhang and Ji Liu},
Booktitle = {ICML},
...@@ -43,7 +106,7 @@
title={{Privacy Amplification by Decentralization}},
author={Edwige Cyffers and Aurélien Bellet},
year={2020},
institution = {arXiv:2012.05326}
}
@article{Duchi2012a,
...@@ -59,6 +122,14 @@
Volume = {57},
Year = {2012}}
@article{jelasity,
Author = {István Hegedüs and Gábor Danner and Márk Jelasity},
Journal = {Journal of Parallel and Distributed Computing},
Pages = {109--124},
Title = {{Decentralized learning works: An empirical comparison of gossip learning and federated learning}},
Volume = {148},
Year = {2021}}
@article{Nedic18,
Author = {Angelia Nedić and Alex Olshevsky and Michael G. Rabbat},
Journal = {Proceedings of the IEEE},
...@@ -620,15 +691,6 @@ pages={211-252}
url = {http://proceedings.mlr.press/v80/tang18a.html},
}
@misc{hsieh2019noniid,
title={The Non-IID Data Quagmire of Decentralized Machine Learning},
author={Kevin Hsieh and Amar Phanishayee and Onur Mutlu and Phillip B. Gibbons},
year={2019},
eprint={1910.00189},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@article{xiao2007distributed,
title={{Distributed Average Consensus with Least-Mean-Square Deviation}},
author={Xiao, Lin and Boyd, Stephen and Kim, Seung-Jean},
...
...@@ -92,7 +92,8 @@ can efficiently deal with such non-IID data
Federated learning algorithms can be classified into two categories depending
on the network topology they work on. In server-based FL, the network is
organized as a star: a central server orchestrates the training process and
iteratively aggregates model updates received from the participants
(\emph{clients}) and sends
the aggregated model back to them \cite{mcmahan2016communication}. In contrast,
fully decentralized FL algorithms operate over an arbitrary topology where
participants communicate in a peer-to-peer fashion with their direct neighbors
...@@ -107,7 +108,7 @@ applications \cite{kairouz2019advances}. Indeed, while a central
server quickly becomes a bottleneck as the number of participants increases, the topology used in fully decentralized algorithms can remain sparse
enough such that all participants have small (constant or logarithmic) degree
\cite{lian2017d-psgd}. Recent work has shown both empirically
\cite{lian2017d-psgd,Lian2018} and theoretically \cite{neglia2020} that sparse
topologies like rings or grids do not significantly affect the convergence
rate compared to using denser topologies when data is IID.
% We also note that full decentralization can also provide benefits in terms of
...@@ -596,44 +597,80 @@ In addition, it is important that all nodes are initialized with the same model
\aurelien{not sure yet if it is better to have this section here or earlier,
we'll see}
\aurelien{TODO: where to place TornadoAggregate and related refs?}
\paragraph{Impact of topology in fully decentralized FL.} It is well known
that the choice of network topology can affect the convergence of fully
decentralized algorithms: this is typically accounted for in the theoretical
convergence rate by a dependence on the spectral gap of the network, see for
instance \cite{Duchi2012a,Colin2016a,lian2017d-psgd,Nedic18}.
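For concreteness, one common form of such updates (our notation, an
illustrative sketch rather than the exact setting of any single cited paper)
is the adapt-then-combine D-SGD step
\begin{equation*}
x_i^{(t+1)} = \sum_{j=1}^{n} W_{ij}\left(x_j^{(t)} - \eta \nabla F_j(x_j^{(t)}; \xi_j^{(t)})\right),
\end{equation*}
where $W$ is a doubly stochastic mixing matrix supported on the graph; the
resulting bounds typically degrade with a factor like $1/(1-\lambda_2(W))$,
where $\lambda_2(W)$ is the second largest eigenvalue magnitude of $W$, so
sparser topologies (smaller spectral gap) yield slower worst-case rates.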
However, for IID data, practice contradicts these classic
results: fully decentralized algorithms converge essentially as fast
on sparse topologies like rings or grids as they do on a fully connected
graph \cite{lian2017d-psgd,Lian2018}. Recent work
\cite{neglia2020,consensus_distance} sheds light on this phenomenon with
refined convergence analyses based on differences between gradients or
parameters across nodes, which are typically smaller in the IID case. However,
these results do not give any clear insight regarding the role of the topology
in the non-IID case. We note that some work has gone into designing efficient
topologies to optimize the use of network resources (see e.g., \cite{marfoq}),
but this is done independently of how data is distributed across nodes. In
summary, the role of topology in the non-IID data scenario is not well
understood and we are not aware of prior work focusing on this question.
\paragraph{Dealing with non-IID data in server-based FL.}
Dealing with non-IID data in server-based FL has
recently attracted a lot of interest. While non-IID data is not an issue if
clients send their parameters to the server after each gradient update,
problems arise when one seeks to reduce the number of communication rounds by
allowing each participant to perform multiple local updates, as in the popular
FedAvg algorithm \cite{mcmahan2016communication}. This led to the design of
extensions specifically aimed at mitigating the impact of non-IID data when
performing multiple local updates, using adaptive sampling \cite{quagmire},
update corrections \cite{scaffold} or regularization in the local objective
\cite{fedprox}. Another direction is to embrace the non-IID scenario by
learning personalized models for each client
\cite{smith2017federated,perso_fl_mean,maml,moreau}.
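As a minimal sketch (our notation, illustrative only), each round $t$ of
FedAvg-style training lets every client $i$ perform $\tau$ local SGD steps
before the server averages:
\begin{equation*}
x_i^{(t,k+1)} = x_i^{(t,k)} - \eta \nabla F_i(x_i^{(t,k)}; \xi_i^{(t,k)}),
\quad k = 0,\dots,\tau-1,
\qquad
x^{(t+1)} = \frac{1}{n}\sum_{i=1}^{n} x_i^{(t,\tau)}.
\end{equation*}
With non-IID data, the local iterates drift toward each client's own optimum
between averaging steps, which is the effect the corrections above aim to
control.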
\paragraph{Dealing with non-IID data in fully decentralized FL.}
Non-IID data is known to negatively impact the convergence speed
of fully decentralized FL algorithms in practice \cite{jelasity}. This
motivated the design of algorithms with modified updates based on variance
reduction \cite{tang18a}, momentum correction \cite{momentum_noniid},
cross-gradient
aggregation \cite{cross_gradient}, or multiple averaging steps
between updates (see \cite{consensus_distance} and references therein). These
algorithms
typically require additional communication and/or computation.\footnote{We
also observed that \cite{tang18a} is subject to numerical
instabilities when run on topologies other than rings and grids. When
the rows and columns of $W$ do not exactly
sum to $1$ (due to finite precision), these small differences get amplified by
the proposed updates and make the algorithm diverge.}
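As a side note, the following minimal Python sketch (ours, not from any cited
implementation; function names are hypothetical) illustrates the kind of
sanity check and repair this suggests: measure how far $W$ is from doubly
stochastic and, if needed, renormalize it with Sinkhorn-style iterations.
\begin{verbatim}
import numpy as np

def doubly_stochastic_error(W):
    # Max deviation of row/column sums from 1 (finite-precision drift).
    return max(np.abs(W.sum(axis=1) - 1).max(),
               np.abs(W.sum(axis=0) - 1).max())

def sinkhorn_renormalize(W, iters=100):
    # Alternately rescale rows then columns toward sum 1; for a nonnegative
    # mixing matrix with positive diagonal this converges to a doubly
    # stochastic matrix (Sinkhorn iteration).
    for _ in range(iters):
        W = W / W.sum(axis=1, keepdims=True)
        W = W / W.sum(axis=0, keepdims=True)
    return W
\end{verbatim}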
% non-IID known to be a problem for fully decentralized FL. cf Jelasity paper
% D2 and other recent papers on modifying updates: Quasi-Global Momentum,
% Cross-Gradient Aggregation
% papers using multiple averaging steps
% also our personalized papers
% D2 \cite{tang18a}: numerically unstable when $W_{ij}$ rows and columns do not exactly
% sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
In contrast, D-Cliques focuses on the design of a sparse topology which is
able to compensate for the effect of non-IID data. We do not modify the simple
and efficient D-SGD algorithm \cite{lian2017d-psgd} beyond removing some
neighbor contributions that would otherwise bias the direction of the gradient.
\aurelien{add personalized models - or merge all that in specific paragraph}
% An originality of our approach is to focus on the effect of topology
% level without significantly changing the original simple and efficient D-SGD
% algorithm \cite{lian2017d-psgd}. Other work to mitigate the effect of non-IID
% data on decentralized algorithms are based on performing modified updates (eg
% with variance reduction) or multiple averaging steps.
\section{Future Work}
...