diff --git a/main.bib b/main.bib index aa91cd3d7774a88826382a14f0972eb1140702fa..dc335c67debfc746431c2e7f1c4ed085c2bdb8af 100644 --- a/main.bib +++ b/main.bib @@ -1,3 +1,20 @@ +@INPROCEEDINGS{Vanhaesebrouck2017a, + author = {Paul Vanhaesebrouck and Aur\'elien Bellet and Marc Tommasi}, + title = {{D}ecentralized {C}ollaborative {L}earning of {P}ersonalized {M}odels over {N}etworks}, + booktitle = {{AISTATS}}, + year = {2017} +} + +@INPROCEEDINGS{Zantedeschi2020a, + author = {Valentina Zantedeschi and Aur\'elien Bellet and Marc Tommasi}, + title = {{F}ully {D}ecentralized {J}oint {L}earning of {P}ersonalized + {M}odels and {C}ollaboration {G}raphs}, + booktitle = {{AISTATS}}, + year = {2020} +} + + + @inproceedings{smith2017federated, title={{Federated Multi-Task Learning}}, author={Smith, Virginia and Chiang, Chao-Kai and Sanjabi, Maziar and Talwalkar, Ameet S.}, @@ -33,6 +50,13 @@ institution = {arXiv:2102.04761} } +@techreport{tornado, + title={{TornadoAggregate: Accurate and Scalable Federated Learning via the Ring-Based Architecture}}, + author={Jin-Woo Lee and Jaehoon Oh and Sungsu Lim and Se-Young Yun and Jae-Gil Lee}, + year={2020}, + institution = {arXiv:2012.03214} +} + @techreport{cross_gradient, title={{Cross-Gradient Aggregation for Decentralized Learning from Non-IID data}}, author={Yasaman Esfandiari and Sin Yong Tan and Zhanhong Jiang and Aditya Balu and Ethan Herron and Chinmay Hegde and Soumik Sarkar}, diff --git a/main.tex b/main.tex index cb4ed95b6075ec168231379f0f1d66d469a12917..617a02a13c35fad3c24ab438bfbe93f06288754f 100644 --- a/main.tex +++ b/main.tex @@ -845,41 +845,45 @@ show that D-Cliques can scale nicely with the number of nodes. \section{Related Work} \label{section:related-work} -\aurelien{TODO: where to place TornadoAggregate and related refs?} +In this section, we review some related work on dealing with non-IID data in +FL, and on the role of topology in fully decentralized algorithms. 
\paragraph{Dealing with non-IID data in server-based FL.} -Dealing with non-IID data in server-based FL has -recently attracted a lot of interest. While non-IID data is not an issue if +While non-IID data is not an issue in server-based FL if clients send their parameters to the server after each gradient update, problems arise when one seeks to reduce the number of communication rounds by allowing each participant to perform multiple local updates, as in the popular FedAvg algorithm -\cite{mcmahan2016communication}. This led to the design of extensions that are -specifically designed to mitigate the impact of non-IID data when performing -multiple local updates, using adaptive sampling \cite{quagmire}, update +\cite{mcmahan2016communication}. Indeed, non-IID data can prevent the +algorithm from +converging to a good solution in this case. This led to the design of +extensions that are specifically designed to mitigate the impact of non-IID +data when performing +multiple local updates, using adaptive client sampling \cite{quagmire}, update corrections \cite{scaffold} or regularization in the local objective \cite{fedprox}. Another direction is to embrace the non-IID scenario by learning personalized models for each client \cite{smith2017federated,perso_fl_mean,maml,moreau}. +We note that recent work explores ring-based server topologies +\cite{tornado}, but the focus is not on dealing with non-IID data but +on making server-based FL more scalable to a large number of clients. \paragraph{Dealing with non-IID data in fully decentralized FL.} Non-IID data is known to negatively impact the convergence speed -of fully decentralized FL algorithms in practice \cite{jelasity}. 
Aside from approaches that aim to learn personalized models \cite{Vanhaesebrouck2017a,Zantedeschi2020a}, this motivated the design of algorithms with modified updates based on variance reduction \cite{tang18a}, momentum correction \cite{momentum_noniid}, cross-gradient aggregation \cite{cross_gradient}, or multiple averaging steps between updates (see \cite{consensus_distance} and references therein). These algorithms -typically require additional communication and/or computation.\footnote{We +typically require additional communication and/or computation, and have +only been evaluated in small-scale networks with a few tens of nodes.\footnote{We also observed that \cite{tang18a} is subject to numerical instabilities when run on topologies other than rings. When the rows and columns of $W$ do not exactly sum to $1$ (due to finite precision), these small differences get amplified by the proposed updates and make the algorithm diverge.} - -\aurelien{emphasize that they only do small scale experiments} - % non-IID known to be a problem for fully decentralized FL. cf Jelasity paper % D2 and other recent papers on modifying updates: Quasi-Global Momentum, % Cross-Gradient Aggregation @@ -888,14 +892,13 @@ the proposed updates and make the algorithm diverge.} % D2 \cite{tang18a}: numerically unstable when $W_{ij}$ rows and columns do not exactly % sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbor contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting. In contrast, D-Cliques focuses on the design of a sparse topology which is -able to compensate for the effect of non-IID data. 
We do not modify the simple +able to compensate for the effect of non-IID data and scales to large +networks. We do not modify the simple and efficient D-SGD algorithm \cite{lian2017d-psgd} beyond removing some neighbor contributions that would otherwise bias the direction of the gradient. -\aurelien{add personalized models - or merge all that in specific paragraph} - % An originality of our approach is to focus on the effect of topology % level without significantly changing the original simple and efficient D-SGD % algorithm \cite{lian2017d-psgd}. Other work to mitigate the effect of non-IID