From 23c48e8df9c6eee62d942cb673990b6ba9a4730e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien?= <aurelien.bellet@inria.fr>
Date: Fri, 19 Mar 2021 12:21:12 +0100
Subject: [PATCH] cosmit

---
 main.bib |  6 ++++++
 main.tex | 10 +++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/main.bib b/main.bib
index decef7d..0e609ec 100644
--- a/main.bib
+++ b/main.bib
@@ -5,6 +5,12 @@
 year={2020}
 }
 
+@inproceedings{Lian2018,
+  Author = {Xiangru Lian and Wei Zhang and Ce Zhang and Ji Liu},
+  Booktitle = {ICML},
+  Title = {{Asynchronous Decentralized Parallel Stochastic Gradient Descent}},
+  Year = {2018}}
+
 @inproceedings{fedprox,
 author = {Tian Li and Anit Kumar Sahu and Manzil Zaheer and Maziar Sanjabi and Ameet Talwalkar and Virginia Smith},
 title = {{Federated Optimization in Heterogeneous Networks}},
diff --git a/main.tex b/main.tex
index b58f287..93bd1f8 100644
--- a/main.tex
+++ b/main.tex
@@ -137,8 +137,11 @@ unbiased with respect to the class distribution.
 We empirically evaluate our approach on MNIST and CIFAR10 datasets using
 logistic regression and deep convolutional models with up to 1000
 participants. This is
-in contrast to most previous work on fully decentralized algorithms which only
-consider a few tens of participants \cite{refs}.
+in contrast to most previous work on fully decentralized algorithms, which
+considers only a few tens of participants \cite{tang18a,more_refs} and thus
+falls short of
+giving a realistic view of the performance of these algorithms in actual
+applications.
 
 \aurelien{TODO: complete above paragraph with more details and highlighting
 other contributions as needed}
@@ -614,7 +617,8 @@ network, see for instance \cite{Duchi2012a,lian2017d-psgd,Nedic18}.
 % papers using multiple averaging steps
 % also our personalized papers
 
-D2: numerically unstable when $W_{ij}$ rows and columns do not exactly sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
+D2 \cite{tang18a} is numerically unstable when $W_{ij}$ rows and columns do not exactly
+sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-Cliques do not modify the SGD algorithm and instead simply remove some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
 
 An originality of our approach is to focus on the effect of topology level
 without significantly changing the original simple and efficient D-SGD
--
GitLab
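
Note on the second main.tex hunk: it asserts that D2 becomes numerically unstable when the rows and columns of the mixing matrix do not sum exactly to $1$, because the small discrepancies are amplified across iterations. The short sketch below only illustrates that amplification effect under assumed settings (plain gossip averaging on a hypothetical 16-node ring with a uniform $10^{-3}$ perturbation), not the D2 or D-PSGD update itself:

import numpy as np

# Sketch only: plain gossip averaging x <- W x on an assumed 16-node ring,
# not the D2 or D-PSGD update. It shows how row/column sums that are off by
# 1e-3 make the network average drift geometrically instead of being preserved.
n = 16
W = np.zeros((n, n))
for i in range(n):                  # ring: self weight 1/2, each neighbor 1/4
    W[i, i] = 0.5
    W[i, (i - 1) % n] = 0.25
    W[i, (i + 1) % n] = 0.25
W_bad = 1.001 * W                   # rows and columns now sum to 1.001

rng = np.random.default_rng(0)
x0 = rng.standard_normal(n) + 1.0   # one scalar "parameter" per node
target = x0.mean()                  # consensus value the nodes should reach

x_good, x_bad = x0.copy(), x0.copy()
for _ in range(5000):               # repeated averaging steps
    x_good = W @ x_good
    x_bad = W_bad @ x_bad

print(f"exact sums (=1):    |mean - target| = {abs(x_good.mean() - target):.1e}")
print(f"perturbed (=1.001): |mean - target| = {abs(x_bad.mean() - target):.1e}")

With the exact doubly stochastic matrix the network average is preserved to machine precision, whereas with sums of 1.001 it drifts by a factor of roughly $1.001^t$ after $t$ averaging steps, which is the positive feedback loop the paragraph refers to.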