diff --git a/main.tex b/main.tex
index bebaca5ee6ed9a049dfe80494230bef01f5c2c51..2f36b8a1bcfd0d2934566244a86061a03ce0177a 100644
--- a/main.tex
+++ b/main.tex
@@ -658,28 +658,6 @@ we'll see}
 
 \aurelien{TODO: where to place TornadoAggregate and related refs?}
 
-\paragraph{Impact of topology in fully decentralized FL.} It is well
-known
-that the choice of network topology can affect the
-convergence of fully decentralized algorithms: this is typically accounted
-for in the theoretical convergence rate by a dependence on the spectral gap of
-the network, see for instance 
-\cite{Duchi2012a,Colin2016a,lian2017d-psgd,Nedic18}.
-However, for IID data, practice contradicts these classic
-results: fully decentralized algorithms converge essentially as fast
-on sparse topologies like rings or grids as they do on a fully connected
-graph \cite{lian2017d-psgd,Lian2018}. Recent work 
-\cite{neglia2020,consensus_distance} sheds light on this phenomenon with refined convergence analyses based on differences between gradients or parameters across nodes, which are typically
-smaller in the IID case. However, these results do not give any clear insight
-regarding the role of the topology in the non-IID case. We note that some work
-has gone into designing efficient topologies to optimize the use of
-network resources (see e.g., \cite{marfoq}), but this is done independently
-of how data is distributed across nodes. In summary, the role
-of topology in the
-non-IID data scenario is
-not well understood and we are not aware of prior work focusing on this
-question.
-
 \paragraph{Dealing with non-IID data in server-based FL.}
 Dealing with non-IID data in server-based FL has
 recently attracted a lot of interest. While non-IID data is not an issue if
@@ -709,7 +687,8 @@ also observed that \cite{tang18a} is subject to numerical
 instabilities when run on topologies other than rings and grids. When
 the rows and columns of $W$ do not exactly
 sum to $1$ (due to finite precision), these small differences get amplified by
-the proposed updates and make the algorithm diverge.}Z
+the proposed updates and make the algorithm diverge.}\aurelien{emphasize that
+they only do small-scale experiments}
 % non-IID known to be a problem for fully decentralized FL. cf Jelasity paper
 % D2 and other recent papers on modifying updates: Quasi-Global Momentum,
 % Cross-Gradient Aggregation
@@ -732,6 +711,31 @@ that would otherwise bias the direction of the gradient.
 % with variance reduction) or multiple averaging steps.
 
 
+\paragraph{Impact of topology in fully decentralized FL.} It is well
+known
+that the choice of network topology can affect the
+convergence of fully decentralized algorithms: this is typically accounted
+for in the theoretical convergence rate by a dependence on the spectral gap of
+the network, see for
+instance~\cite{Duchi2012a,Colin2016a,lian2017d-psgd,Nedic18}.
+However, for IID data, practice contradicts these classic
+results: fully decentralized algorithms converge essentially as fast
+on sparse topologies like rings or grids as they do on a fully connected
+graph~\cite{lian2017d-psgd,Lian2018}. Recent work~\cite{neglia2020,consensus_distance}
+sheds light on this phenomenon with refined convergence analyses based on differences between gradients or parameters across nodes, which are typically
+smaller in the IID case. However, these results do not give any clear insight
+regarding the role of the topology in the non-IID case. We note that some work
+has gone into designing efficient topologies to optimize the use of
+network resources (see, e.g.,~\cite{marfoq}), but this is done independently
+of how data is distributed across nodes. In summary, the role
+of topology in the non-IID data scenario is not well understood and we are not
+aware of prior work focusing on this question. Our work shows that an
+appropriate choice of data-dependent topology can effectively compensate for
+non-IID data.
+
+\section{Conclusion}
+
+
 %\section{Future Work}
 %\begin{itemize}
 %  \item Non-uniform Class Representation
@@ -741,8 +745,6 @@ that would otherwise bias the direction of the gradient.
 %  \item Relaxing Clique Connectivity: Randomly choose a subset of clique neighbours to compute average gradient.
 %\end{itemize}
 
-\section{Conclusion}
-
 
 \section{Credits}