From 223f059a17d95ff09bf7ae7ce902db3e3c9b3f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien?= <aurelien.bellet@inria.fr> Date: Tue, 23 Mar 2021 11:05:38 +0100 Subject: [PATCH] continue related work --- main.bib | 82 ++++++++++++++++++++++++++++++++++++++++++++------- main.tex | 89 +++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 135 insertions(+), 36 deletions(-) diff --git a/main.bib b/main.bib index 0e609ec..8b6b2f9 100644 --- a/main.bib +++ b/main.bib @@ -1,3 +1,59 @@ +@inproceedings{smith2017federated, + title={{Federated Multi-Task Learning}}, + author={Smith, Virginia and Chiang, Chao-Kai and Sanjabi, Maziar and Talwalkar, Ameet S.}, + booktitle={NIPS}, + year={2017} +} + +@inproceedings{perso_fl_mean, + title={{Lower Bounds and Optimal Algorithms for Personalized Federated Learning}}, + author={Filip Hanzely and Slavomír Hanzely and Samuel Horváth and Peter Richtárik}, + booktitle={NeurIPS}, + year={2020} +} + +@inproceedings{maml, + title={{Personalized Federated Learning with Theoretical Guarantees: A Model-Agnostic Meta-Learning Approach}}, + author={Alireza Fallah and Aryan Mokhtari and Asuman Ozdaglar}, + booktitle={NeurIPS}, + year={2020} +} + +@inproceedings{moreau, + title={{Personalized Federated Learning with Moreau Envelopes}}, + author={Canh T. Dinh and Nguyen H. Tran and Tuan Dung Nguyen}, + booktitle={NeurIPS}, + year={2020} +} + +@techreport{momentum_noniid, + title={{Quasi-Global Momentum: Accelerating Decentralized Deep Learning on Heterogeneous Data}}, + author={Tao Lin and Sai Praneeth Karimireddy and Sebastian U. Stich and Martin Jaggi}, + year={2021}, + institution = {arXiv:2102.04761} +} + +@techreport{cross_gradient, + title={{Cross-Gradient Aggregation for Decentralized Learning from Non-IID data}}, + author={Yasaman Esfandiari and Sin Yong Tan and Zhanhong Jiang and Aditya Balu and Ethan Herron and Chinmay Hegde and Soumik Sarkar}, + year={2021}, + institution = {arXiv:2103.02051} +} + +@techreport{consensus_distance, + title={{Consensus Control for Decentralized Deep Learning}}, + author={Lingjing Kong and Tao Lin and Anastasia Koloskova and Martin Jaggi and Sebastian U. Stich}, + year={2021}, + institution = {arXiv:2102.04828} +} + +@INPROCEEDINGS{Colin2016a, + author = {Igor Colin and Aur\'elien Bellet and Joseph Salmon and St\'ephan Cl\'emen\c{c}on}, + title = {{G}ossip {D}ual {A}veraging for {D}ecentralized {O}ptimization of {P}airwise {F}unctions}, + booktitle = {{ICML}}, + year = {2016} +} + +@inproceedings{scaffold, title={{SCAFFOLD: Stochastic Controlled Averaging for On-Device Federated Learning}}, author={Sai Praneeth Karimireddy and Satyen Kale and Mehryar Mohri and Sashank J. Reddi and Sebastian U.
Stich and Ananda Theertha Suresh}, @@ -5,6 +61,13 @@ year={2020} } +@inproceedings{marfoq, + title={{Throughput-Optimal Topology Design for Cross-Silo Federated Learning}}, + author={Othmane Marfoq and Chuan Xu and Giovanni Neglia and Richard Vidal}, + booktitle={NeurIPS}, + year={2020} +} + @inproceedings{Lian2018, Author = {Xiangru Lian and Wei Zhang and Ce Zhang and Ji Liu}, Booktitle = {ICML}, @@ -43,7 +106,7 @@ title={{Privacy Amplification by Decentralization}}, author={Edwige Cyffers and Aurélien Bellet}, year={2020}, - institution = {2012.05326} + institution = {arXiv:2012.05326} } @article{Duchi2012a, @@ -59,6 +122,14 @@ Volume = {57}, Year = {2012}} +@article{jelasity, + Author = {István Hegedüs and Gábor Danner and Márk Jelasity}, + Journal = {Journal of Parallel and Distributed Computing}, + Pages = {109--124}, + Title = {{Decentralized learning works: An empirical comparison of gossip learning and federated learning}}, + Volume = {148}, + Year = {2021}} + @article{Nedic18, Author = {Angelia Nedić and Alex Olshevsky and Michael G. Rabbat}, Journal = {Proceedings of the IEEE}, @@ -620,15 +691,6 @@ pages={211-252} url = {http://proceedings.mlr.press/v80/tang18a.html}, } -@misc{hsieh2019noniid, - title={The Non-IID Data Quagmire of Decentralized Machine Learning}, - author={Kevin Hsieh and Amar Phanishayee and Onur Mutlu and Phillip B. Gibbons}, - year={2019}, - eprint={1910.00189}, - archivePrefix={arXiv}, - primaryClass={cs.LG} -} - @article{xiao2007distributed, title={{Distributed Average Consensus with Least-Mean-Square Deviation}}, author={Xiao, Lin and Boyd, Stephen and Kim, Seung-Jean}, diff --git a/main.tex b/main.tex index 32cd17e..b92d27c 100644 --- a/main.tex +++ b/main.tex @@ -92,7 +92,8 @@ can efficiently deal with such non-IID data Federated learning algorithms can be classified into two categories depending on the network topology they work on. In server-based FL, the network is organized as a star: a central server orchestrates the training process and -iteratively aggregates model updates received from the participants and sends +iteratively aggregates model updates received from the participants +(\emph{clients}) and sends them back the aggregated model \cite{mcmahan2016communication}. In contrast, fully decentralized FL algorithms operate over an arbitrary topology where participants communicate in a peer-to-peer fashion with their direct neighbors @@ -107,7 +108,7 @@ applications \cite{kairouz2019advances}. Indeed, while a central server quickly becomes a bottleneck as the number of participants increases, the topology used in fully decentralized algorithms can remain sparse enough such that all participants have small (constant or logarithmic) degree \cite{lian2017d-psgd}. Recent work has shown both empirically -\cite{lian2017d-psgd} and theoretically \cite{neglia2020} that sparse +\cite{lian2017d-psgd,Lian2018} and theoretically \cite{neglia2020} that sparse topologies like rings or grids do not significantly affect the convergence rate compared to using denser topologies when data is IID. % We also note that full decentralization can also provide benefits in terms of @@ -596,44 +597,80 @@ In addition, it is important that all nodes are initialized with the same model \aurelien{not sure yet if it is better to have this section here or earlier, we'll see} -% where to place TornadoAggregate and related refs? 
+\aurelien{TODO: where to place TornadoAggregate and related refs?} \paragraph{Impact of topology in fully decentralized FL.} It is well known that the choice of network topology can affect the convergence of fully decentralized algorithms: this is typically accounted -for -in the theoretical convergence rate by a dependence on the spectral gap of the -network, see for instance \cite{Duchi2012a,lian2017d-psgd,Nedic18}. - -% mention Neglia and empirical results for IID data, probably also Consensus -% Control paper which does not allow to analyze the effect of topology. -% can mention Marfoq paper on topology design but to optimize network -% resources, independent of data -% conclusion: role of topology in non-IID is not understood / has not -% been much studied before our work. +for in the theoretical convergence rate by a dependence on the spectral gap of +the network, see for instance +\cite{Duchi2012a,Colin2016a,lian2017d-psgd,Nedic18}. +However, for IID data, practice contradicts these classic +results: fully decentralized algorithms converge essentially as fast +on sparse topologies like rings or grids as they do on a fully connected +graph \cite{lian2017d-psgd,Lian2018}. Recent work +\cite{neglia2020,consensus_distance} sheds light on this phenomenon with refined convergence analyses based on differences between gradients or parameters across nodes, which are typically +smaller in the IID case. However, these results do not give any clear insight +regarding the role of the topology in the non-IID case. We note that some work +has gone into designing efficient topologies to optimize the use of +network resources (see, e.g., \cite{marfoq}), but this is done independently +of how data is distributed across nodes. In summary, the role +of topology in the +non-IID data scenario is +not well understood and we are not aware of prior work focusing on this +question. \paragraph{Dealing with non-IID data in server-based FL.} - -% scaffold, quagmire, fedprox, etc -% also personalized models: Smith etc +Dealing with non-IID data in server-based FL has +recently attracted a lot of interest. While non-IID data is not an issue if +clients send their parameters to the server after each gradient update, +problems arise when one seeks to reduce +the number of communication rounds by allowing each participant to perform +multiple local updates, as in the popular FedAvg algorithm +\cite{mcmahan2016communication}. This has led to extensions that are +specifically designed to mitigate the impact of non-IID data when performing +multiple local updates, using adaptive sampling \cite{quagmire}, update +corrections \cite{scaffold}, or regularization in the local objective +\cite{fedprox}. Another direction is to embrace the non-IID scenario by +learning personalized models for each client +\cite{smith2017federated,perso_fl_mean,maml,moreau}. \paragraph{Dealing with non-IID data in fully decentralized FL.} +Non-IID data is known to negatively impact the convergence speed +of fully decentralized FL algorithms in practice \cite{jelasity}. This +motivated the design of algorithms with modified updates based on variance +reduction \cite{tang18a}, momentum correction \cite{momentum_noniid}, +cross-gradient +aggregation \cite{cross_gradient}, or multiple averaging steps +between updates (see \cite{consensus_distance} and references therein).
These +algorithms +typically require additional communication and/or computation.\footnote{We +also observed that \cite{tang18a} is subject to numerical +instabilities when run on topologies other than rings and grids. When +the rows and columns of $W$ do not exactly +sum to $1$ (due to finite precision), these small differences get amplified by +the proposed updates and make the algorithm diverge.} % non-IID known to be a problem for fully decentralized FL. cf Jelasity paper % D2 and other recent papers on modifying updates: Quasi-Global Momentum, % Cross-Gradient Aggregation % papers using multiple averaging steps % also our personalized papers - -D2 \cite{tang18a}: numerically unstable when $W_{ij}$ rows and columns do not exactly -sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting. - -An originality of our approach is to focus on the effect of topology -level without significantly changing the original simple and efficient D-SGD -algorithm \cite{lian2017d-psgd}. Other work to mitigate the effect of non-IID -data on decentralized algorithms are based on performing modified updates (eg -with variance reduction) or multiple averaging steps. +% D2 \cite{tang18a}: numerically unstable when $W_{ij}$ rows and columns do not exactly +% sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting. +In contrast, D-Cliques focuses on the design of a sparse topology that is +able to compensate for the effect of non-IID data. We do not modify the simple +and efficient D-SGD +algorithm \cite{lian2017d-psgd} beyond removing some neighbor +contributions +that would otherwise bias the direction of the gradient. +\aurelien{add personalized models - or merge all that in specific paragraph} + +% An originality of our approach is to focus on the effect of topology % level without significantly changing the original simple and efficient D-SGD % algorithm \cite{lian2017d-psgd}. Other work to mitigate the effect of non-IID % data on decentralized algorithms are based on performing modified updates (eg % with variance reduction) or multiple averaging steps. \section{Future Work} -- GitLab
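For reference, the paragraph on the impact of topology and the footnote on \cite{tang18a} above rely on standard assumptions about the mixing (gossip) matrix $W$ without restating them. The following is a minimal LaTeX sketch of these standard assumptions and of the spectral gap quantity; it is not part of the patch itself, and the notation ($n$ for the number of nodes, $\mathcal{N}_i$ for the neighbors of node $i$, $\lambda_2$ for the second largest eigenvalue modulus of a symmetric $W$) is illustrative rather than taken from the paper.

% Illustrative sketch of the usual assumptions on the mixing matrix W.
\begin{equation}
  W_{ij} \geq 0, \qquad
  W_{ij} = 0 \ \text{if } j \notin \mathcal{N}_i \cup \{i\}, \qquad
  \sum_{j=1}^{n} W_{ij} = 1 \ \text{ and } \ \sum_{i=1}^{n} W_{ij} = 1,
\end{equation}
% i.e., W is doubly stochastic and supported on the communication graph.
Convergence bounds such as those in \cite{Duchi2012a,lian2017d-psgd,Nedic18} typically degrade as the spectral gap $1 - \lambda_2$ shrinks, which is why better-connected topologies are expected to converge faster. The finite-precision issue mentioned in the footnote corresponds to the row and column sums of $W$ deviating slightly from $1$.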
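Similarly, the discussion of what D-Cliques does and does not change refers to the D-SGD (D-PSGD) update of \cite{lian2017d-psgd} without restating it. Below is a minimal sketch of one common form of this update, again with illustrative notation ($x_i^{(t)}$ is the local model of node $i$ at step $t$, $\gamma$ the step size, $F_i$ the local objective, $\xi_i^{(t)}$ a sampled mini-batch); variants may interleave the averaging and gradient steps differently.

% Illustrative sketch of a standard D-SGD update step.
\begin{equation}
  x_i^{(t+1)} = \sum_{j=1}^{n} W_{ij}\, x_j^{(t)} - \gamma\, \nabla F_i\big(x_i^{(t)}; \xi_i^{(t)}\big),
\end{equation}
that is, each node averages its neighbors' models (weighted by the mixing matrix $W$) and takes a local stochastic gradient step, so the topology enters the algorithm only through the entries of $W$.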