From 223f059a17d95ff09bf7ae7ce902db3e3c9b3f46 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien?= <aurelien.bellet@inria.fr>
Date: Tue, 23 Mar 2021 11:05:38 +0100
Subject: [PATCH] continue related work

---
 main.bib | 82 ++++++++++++++++++++++++++++++++++++++++++++-------
 main.tex | 89 +++++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 135 insertions(+), 36 deletions(-)

diff --git a/main.bib b/main.bib
index 0e609ec..8b6b2f9 100644
--- a/main.bib
+++ b/main.bib
@@ -1,3 +1,59 @@
+@inproceedings{smith2017federated,
+  title={{Federated Multi-Task Learning}},
+  author={Smith, Virginia and Chiang, Chao-Kai and Sanjabi, Maziar and Talwalkar, Ameet S.},
+  booktitle={NIPS},
+  year={2017}
+}
+
+@inproceedings{perso_fl_mean,
+  title={{Lower Bounds and Optimal Algorithms for Personalized Federated Learning}},
+  author={Filip Hanzely and Slavomír Hanzely and Samuel Horváth and Peter Richtárik},
+  booktitle={NeurIPS},
+  year={2020}
+}
+
+@inproceedings{maml,
+  title={{Personalized Federated Learning with Theoretical Guarantees: A Model-Agnostic Meta-Learning Approach}},
+  author={Alireza Fallah and Aryan Mokhtari and Asuman Ozdaglar},
+  booktitle={NeurIPS},
+  year={2020}
+}
+
+@inproceedings{moreau,
+  title={{Personalized Federated Learning with Moreau Envelopes}},
+  author={Canh T. Dinh and Nguyen H. Tran and Tuan Dung Nguyen},
+  booktitle={NeurIPS},
+  year={2020}
+}
+
+@techreport{momentum_noniid,
+    title={{Quasi-Global Momentum: Accelerating Decentralized Deep Learning on Heterogeneous Data}},
+    author={Tao Lin and Sai Praneeth Karimireddy and Sebastian U. Stich and Martin Jaggi},
+    year={2021},
+    institution = {arXiv:2102.04761}
+}
+
+@techreport{cross_gradient,
+    title={{Cross-Gradient Aggregation for Decentralized Learning from Non-IID Data}},
+    author={Yasaman Esfandiari and Sin Yong Tan and Zhanhong Jiang and Aditya Balu and Ethan Herron and Chinmay Hegde and Soumik Sarkar},
+    year={2021},
+    institution = {arXiv:2103.02051}
+}
+
+@techreport{consensus_distance,
+    title={{Consensus Control for Decentralized Deep Learning}},
+    author={Lingjing Kong and Tao Lin and Anastasia Koloskova and Martin Jaggi and Sebastian U. Stich},
+    year={2021},
+    institution = {arXiv:2102.04828}
+}
+
+@INPROCEEDINGS{Colin2016a,
+  author = {Igor Colin and Aur\'elien Bellet and Joseph Salmon and St\'ephan Cl\'emen\c{c}on},
+  title = {{G}ossip {D}ual {A}veraging for {D}ecentralized {O}ptimization of {P}airwise {F}unctions},
+  booktitle = {{ICML}},
+  year = {2016}
+}
+
 @inproceedings{scaffold,
   title={{SCAFFOLD: Stochastic Controlled Averaging for On-Device Federated Learning}},
   author={Sai Praneeth Karimireddy and Satyen Kale and Mehryar Mohri and Sashank J. Reddi and Sebastian U. Stich and Ananda Theertha Suresh},
@@ -5,6 +61,13 @@
   year={2020}
 }
 
+@inproceedings{marfoq,
+  title={{Throughput-Optimal Topology Design for Cross-Silo Federated Learning}},
+  author={Othmane Marfoq and Chuan Xu and Giovanni Neglia and Richard Vidal},
+  booktitle={NeurIPS},
+  year={2020}
+}
+
 @inproceedings{Lian2018,
   Author = {Xiangru Lian and Wei Zhang and Ce Zhang and Ji Liu},
   Booktitle = {ICML},
@@ -43,7 +106,7 @@
     title={{Privacy Amplification by Decentralization}},
     author={Edwige Cyffers and Aurélien Bellet},
     year={2020},
-    institution = {2012.05326}
+    institution = {arXiv:2012.05326}
 }
 
 @article{Duchi2012a,
@@ -59,6 +122,14 @@
     Volume = {57},
     Year = {2012}}
 
+@article{jelasity,
+    Author = {István Hegedüs and Gábor Danner and Márk Jelasity},
+    Journal = {Journal of Parallel and Distributed Computing},
+    Pages = {109--124},
+    Title = {{Decentralized learning works: An empirical comparison of gossip learning and federated learning}},
+    Volume = {148},
+    Year = {2021}}
+
 @article{Nedic18,
     Author = {Angelia Nedić and Alex Olshevsky and Michael G. Rabbat},
     Journal = {Proceedings of the IEEE},
@@ -620,15 +691,6 @@ pages={211-252}
   url = 	 {http://proceedings.mlr.press/v80/tang18a.html},
 }
 
-@misc{hsieh2019noniid,
-    title={The Non-IID Data Quagmire of Decentralized Machine Learning},
-    author={Kevin Hsieh and Amar Phanishayee and Onur Mutlu and Phillip B. Gibbons},
-    year={2019},
-    eprint={1910.00189},
-    archivePrefix={arXiv},
-    primaryClass={cs.LG}
-}
-
 @article{xiao2007distributed,
   title={{Distributed Average Consensus with Least-Mean-Square Deviation}},
   author={Xiao, Lin and Boyd, Stephen and Kim, Seung-Jean},
diff --git a/main.tex b/main.tex
index 32cd17e..b92d27c 100644
--- a/main.tex
+++ b/main.tex
@@ -92,7 +92,8 @@ can efficiently deal with such non-IID data
 Federated learning algorithms can be classified into two categories depending
 on the network topology they work on. In server-based FL, the network is
 organized as a star: a central server orchestrates the training process and
-iteratively aggregates model updates received from the participants and sends
+iteratively aggregates model updates received from the participants
+(\emph{clients}) and sends
 them back the aggregated model \cite{mcmahan2016communication}. In contrast,
 fully decentralized FL algorithms operate over an arbitrary topology where
 participants communicate in a peer-to-peer fashion with their direct neighbors
@@ -107,7 +108,7 @@ applications \cite{kairouz2019advances}. Indeed, while a central
 server quickly becomes a bottleneck as the number of participants increases, the topology used in fully decentralized algorithms can remain sparse
 enough such that all participants have small (constant or logarithmic) degree 
 \cite{lian2017d-psgd}. Recent work has shown both empirically 
-\cite{lian2017d-psgd} and theoretically \cite{neglia2020} that sparse
+\cite{lian2017d-psgd,Lian2018} and theoretically \cite{neglia2020} that sparse
 topologies like rings or grids do not significantly affect the convergence
 rate compared to using denser topologies when data is IID.
 % We also note that full decentralization can also provide benefits in terms of
@@ -596,44 +597,80 @@ In addition, it is important that all nodes are initialized with the same model
 \aurelien{not sure yet if it is better to have this section here or earlier,
 we'll see}
 
-% where to place TornadoAggregate and related refs?
+\aurelien{TODO: where to place TornadoAggregate and related refs?}
 
 \paragraph{Impact of topology in fully decentralized FL.} It is well
 known
 that the choice of network topology can affect the
 convergence of fully decentralized algorithms: this is typically accounted
-for
-in the theoretical convergence rate by a dependence on the spectral gap of the
-network, see for instance \cite{Duchi2012a,lian2017d-psgd,Nedic18}.
-
-% mention Neglia and empirical results for IID data, probably also Consensus
-% Control paper which does not allow to analyze the effect of topology.
-% can mention Marfoq paper on topology design but to optimize network
-% resources, independent of data
-% conclusion: role of topology in non-IID is not understood / has not
-% been much studied before our work.
+for in the theoretical convergence rate by a dependence on the spectral gap of
+the network, see for instance 
+\cite{Duchi2012a,Colin2016a,lian2017d-psgd,Nedic18}.
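+For concreteness, a typical update in this family of algorithms (in the
+spirit of D-SGD \cite{lian2017d-psgd}; the notation here is only meant
+to be illustrative) has each node $i$ combine a local stochastic
+gradient step with an averaging step over its neighbors, weighted by a
+mixing matrix $W$:
+\[
+x_i^{(t+1)} = \sum_{j} W_{ij} \Big( x_j^{(t)} - \eta \nabla F_j\big(x_j^{(t)}; \xi_j^{(t)}\big) \Big),
+\]
+where $F_j$ is the local objective of node $j$, $\xi_j^{(t)}$ a local
+mini-batch, and $W_{ij} > 0$ only if $i$ and $j$ are neighbors. Sparser
+topologies generally have a smaller spectral gap $1 - \lambda_2(W)$,
+which classically translates into slower worst-case convergence.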
+However, for IID data, practice contradicts these classical results:
+fully decentralized algorithms converge essentially as fast on sparse
+topologies like rings or grids as on a fully connected graph
+\cite{lian2017d-psgd,Lian2018}. Recent work
+\cite{neglia2020,consensus_distance} sheds light on this phenomenon with
+refined convergence analyses that depend on the differences between
+gradients or parameters across nodes, which are typically smaller in the
+IID case. Yet these results do not give any clear insight into the role
+of the topology in the non-IID case. We note that some work has gone
+into designing efficient topologies to optimize the use of network
+resources (see, e.g., \cite{marfoq}), but this is done independently of
+how data is distributed across nodes. In summary, the role of topology
+in the non-IID data scenario is not well understood, and we are not
+aware of prior work focusing on this question.
 
 \paragraph{Dealing with non-IID data in server-based FL.}
-
-% scaffold, quagmire, fedprox, etc
-% also personalized models: Smith etc
+Dealing with non-IID data in server-based FL has recently attracted a
+lot of interest. While non-IID data is not an issue if clients send
+their parameters to the server after each gradient update, problems
+arise when one seeks to reduce the number of communication rounds by
+allowing each participant to perform multiple local updates, as in the
+popular FedAvg algorithm \cite{mcmahan2016communication}.
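+As a schematic illustration of the issue, analyses of such methods
+typically bound the heterogeneity of the data as
+$\frac{1}{n}\sum_{i=1}^{n} \|\nabla F_i(x) - \nabla F(x)\|^2 \leq \zeta^2$,
+where $F_i$ is the local objective of client $i$ and
+$F = \frac{1}{n}\sum_{i=1}^{n} F_i$ the global one: with multiple local
+updates, each client drifts towards the minimizer of its own $F_i$, and
+this drift grows with $\zeta$.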
+This has spurred extensions that specifically mitigate the impact of
+non-IID data when performing multiple local updates, using adaptive
+sampling \cite{quagmire}, update corrections \cite{scaffold}, or
+regularization in the local objective \cite{fedprox}. Another direction
+is to embrace the non-IID scenario by learning personalized models for
+each client \cite{smith2017federated,perso_fl_mean,maml,moreau}.
 
 \paragraph{Dealing with non-IID data in fully decentralized FL.}
-
+Non-IID data is known to negatively impact the convergence speed of
+fully decentralized FL algorithms in practice \cite{jelasity}. This
+motivated the design of algorithms with modified updates based on
+variance reduction \cite{tang18a}, momentum correction
+\cite{momentum_noniid}, cross-gradient aggregation \cite{cross_gradient},
+or multiple averaging steps between updates (see
+\cite{consensus_distance} and references therein). These algorithms
+typically require additional communication and/or
+computation.\footnote{We also observed that \cite{tang18a} is subject to
+numerical instabilities when run on topologies other than rings and
+grids. When the rows and columns of $W$ do not exactly sum to $1$ (due
+to finite precision), these small differences get amplified by the
+proposed updates and make the algorithm diverge.
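+As a stylized illustration of this effect: if every row of $W$ sums to
+$1+\epsilon$ for some small $\epsilon \neq 0$, then
+$W\mathbf{1} = (1+\epsilon)\mathbf{1}$, so $t$ mixing steps scale the
+mean of the iterates by $(1+\epsilon)^t$, which drifts geometrically
+instead of being preserved.}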
 % non-IID known to be a problem for fully decentralized FL. cf Jelasity paper
 % D2 and other recent papers on modifying updates: Quasi-Global Momentum,
 % Cross-Gradient Aggregation
 % papers using multiple averaging steps
 % also our personalized papers
-
-D2 \cite{tang18a}: numerically unstable when $W_{ij}$ rows and columns do not exactly
-sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
-
-An originality of our approach is to focus on the effect of topology
-level without significantly changing the original simple and efficient D-SGD
-algorithm \cite{lian2017d-psgd}. Other work to mitigate the effect of non-IID
-data on decentralized algorithms are based on performing modified updates (eg
-with variance reduction) or multiple averaging steps.
+% D2 \cite{tang18a}: numerically unstable when $W_{ij}$ rows and columns do not exactly
+% sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
+In contrast, D-Cliques focuses on the design of a sparse topology that
+compensates for the effect of non-IID data. We do not modify the simple
+and efficient D-SGD algorithm \cite{lian2017d-psgd} beyond removing some
+neighbor contributions that would otherwise bias the direction of the
+gradient.
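+Schematically, the topology is constructed so that the average gradient
+within each clique $C$ approximates the global gradient, i.e.,
+$\frac{1}{|C|}\sum_{i \in C} \nabla F_i(x) \approx \nabla F(x)$, so
+averaging within a clique already removes most of the bias introduced
+by non-IID data.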
+\aurelien{add personalized models - or merge all that in specific paragraph}
+
+% An originality of our approach is to focus on the effect of topology
+% level without significantly changing the original simple and efficient D-SGD
+% algorithm \cite{lian2017d-psgd}. Other work to mitigate the effect of non-IID
+% data on decentralized algorithms are based on performing modified updates (eg
+% with variance reduction) or multiple averaging steps.
 
 
 \section{Future Work}
-- 
GitLab