diff --git a/main.bib b/main.bib
index 37764d25598c39f25f2d1872ee6ef197f17278dd..4050742108ab0b3eaf983658a868286aa8b87df0 100644
--- a/main.bib
+++ b/main.bib
@@ -141,7 +141,7 @@
 
 @techreport{kairouz2019advances,
     title={{Advances and Open Problems in Federated Learning}},
-    author={Peter Kairouz and H. Brendan McMahan and Brendan Avent and Aurélien Bellet and Mehdi Bennis and Arjun Nitin Bhagoji and Keith Bonawitz and Zachary Charles and Graham Cormode and Rachel Cummings and Rafael G. L. D'Oliveira and Salim El Rouayheb and David Evans and Josh Gardner and Zachary Garrett and Adrià Gascón and Badih Ghazi and Phillip B. Gibbons and Marco Gruteser and Zaid Harchaoui and Chaoyang He and Lie He and Zhouyuan Huo and Ben Hutchinson and Justin Hsu and Martin Jaggi and Tara Javidi and Gauri Joshi and Mikhail Khodak and Jakub Konečný and Aleksandra Korolova and Farinaz Koushanfar and Sanmi Koyejo and Tancrède Lepoint and Yang Liu and Prateek Mittal and Mehryar Mohri and Richard Nock and Ayfer Özgür and Rasmus Pagh and Mariana Raykova and Hang Qi and Daniel Ramage and Ramesh Raskar and Dawn Song and Weikang Song and Sebastian U. Stich and Ziteng Sun and Ananda Theertha Suresh and Florian Tramèr and Praneeth Vepakomma and Jianyu Wang and Li Xiong and Zheng Xu and Qiang Yang and Felix X. Yu and Han Yu and Sen Zhao},
+    author={Peter Kairouz and others},
     year={2019},
     institution = {arXiv:1912.04977}
 }
@@ -606,14 +606,11 @@ pages={211-252}
 }
 
 @incollection{lian2017d-psgd,
-  title = {Can Decentralized Algorithms Outperform Centralized Algorithms? A Case Study for Decentralized Parallel Stochastic Gradient Descent},
+  title = {{Can Decentralized Algorithms Outperform Centralized Algorithms? A Case Study for Decentralized Parallel Stochastic Gradient Descent}},
   author = {Lian, Xiangru and Zhang, Ce and Zhang, Huan and Hsieh, Cho-Jui and Zhang, Wei and Liu, Ji},
-  booktitle = {Advances in Neural Information Processing Systems 30},
-  editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett},
-  pages = {5330--5340},
+  booktitle = {Advances in Neural Information Processing Systems},
   year = {2017},
   publisher = {Curran Associates, Inc.},
-  url = {http://papers.nips.cc/paper/7117-can-decentralized-algorithms-outperform-centralized-algorithms-a-case-study-for-decentralized-parallel-stochastic-gradient-descent.pdf}
 }
 
 @article{nedic2016sgp, 
@@ -657,11 +654,9 @@ pages={211-252}
 @article{kempe2003gossip,
   title={{Gossip-based Computation of Aggregate Information}},
   author={Kempe, David and Dobra, Alin and Gehrke, Johannes},
-  journal={44th Annual IEEE Symposium on Foundations of Computer Science, 2003. Proceedings.},
-  pages={482--491},
+  journal={Foundations of Computer Science},
   year={2003},
-  organization={IEEE},
-  doi={10.1109/SFCS.2003.1238221}
+  organization={IEEE}
 }
 
 @article{nedic2018network,
@@ -678,17 +673,9 @@ pages={211-252}
 @inproceedings{tang18a,
   title = 	 {{$D^2$: Decentralized Training over Decentralized Data}},
   author = 	 {Tang, Hanlin and Lian, Xiangru and Yan, Ming and Zhang, Ce and Liu, Ji},
-  booktitle = 	 {Proceedings of the 35th International Conference on Machine Learning},
-  pages = 	 {4848--4856},
+  booktitle = 	 {ICML},
   year = 	 {2018},
-  editor = 	 {Dy, Jennifer and Krause, Andreas},
-  volume = 	 {80},
-  series = 	 {Proceedings of Machine Learning Research},
-  address = 	 {Stockholmsmässan, Stockholm Sweden},
-  month = 	 {10--15 Jul},
-  publisher = 	 {PMLR},
-  pdf = 	 {http://proceedings.mlr.press/v80/tang18a/tang18a.pdf},
-  url = 	 {http://proceedings.mlr.press/v80/tang18a.html},
+  publisher = 	 {PMLR}
 }
 
 @article{xiao2007distributed,
@@ -810,4 +797,63 @@ pages={211-252}
       eprint={2102.04828},
       archivePrefix={arXiv},
       primaryClass={cs.LG}
-}
\ No newline at end of file
+}
+
+@techreport{krizhevsky2009learning,
+  title={{Learning Multiple Layers of Features from Tiny Images}},
+  author={Krizhevsky, Alex},
+  year={2009},
+  institution={University of Toronto},
+  url={https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf}
+}
+
+@article{xiao2004fast,
+  title={Fast linear iterations for distributed averaging},
+  author={Xiao, Lin and Boyd, Stephen},
+  journal={Systems \& Control Letters},
+  volume={53},
+  number={1},
+  pages={65--78},
+  year={2004},
+  publisher={Elsevier}
+}
+
+@article{jelasity2007gossip,
+  title={Gossip-based peer sampling},
+  author={Jelasity, M{\'a}rk and Voulgaris, Spyros and Guerraoui, Rachid and Kermarrec, Anne-Marie and Van Steen, Maarten},
+  journal={ACM Transactions on Computer Systems},
+  volume={25},
+  number={3},
+  pages={8--es},
+  year={2007},
+  publisher={ACM}
+}
+
+@InProceedings{pmlr-v28-sutskever13, 
+    title = {On the importance of initialization and momentum in deep learning}, 
+    author = {Ilya Sutskever and James Martens and George Dahl and Geoffrey Hinton}, 
+    booktitle = {ICML}, 
+    year = {2013}, 
+    publisher = {PMLR}
+}
+
+@article{lecun1998gradient,
+  title={{Gradient-based Learning Applied to Document Recognition}},
+  author={LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and Haffner, Patrick},
+  journal={Proceedings of the IEEE},
+  volume={86},
+  number={11},
+  pages={2278--2324},
+  year={1998},
+  publisher={IEEE}
+}
+
+@article{stoica2003chord,
+  title={Chord: a scalable peer-to-peer lookup protocol for internet applications},
+  author={Stoica, Ion and Morris, Robert and Liben-Nowell, David and Karger, David R and Kaashoek, M Frans and Dabek, Frank and Balakrishnan, Hari},
+  journal={IEEE/ACM Transactions on Networking},
+  volume={11},
+  number={1},
+  pages={17--32},
+  year={2003},
+  publisher={IEEE}
+}
diff --git a/main.tex b/main.tex
index 1f0cc1a1ae2147100d851745a471955fb1cd151d..4a3663cf3f7be05fc9894ed887ad7731fc2184a6 100644
--- a/main.tex
+++ b/main.tex
@@ -132,7 +132,7 @@ modification of the standard D-SGD algorithm which ensures that gradients are
 unbiased with respect to the class distribution. Clique Averaging can be used to implement optimization 
 techniques, such as momentum, that otherwise rely on an IID assumption on mini-batches.
 
-We empirically evaluate our approach on MNIST and CIFAR10 datasets using
+We empirically evaluate our approach on MNIST~\cite{mnistWebsite} and CIFAR10~\cite{krizhevsky2009learning} datasets using
 logistic regression and deep convolutional models with up to 1000 participants. This is
 in contrast to most previous work on fully decentralized algorithms
 considering only a few tens of participants \cite{tang18a,more_refs}. With 1000 participants, the resulting design requires 98\% less edges ($18.9$ vs $999$ edges per participant on average) and a 96\% reduction in the total number of required messages (37.8 messages per round per node on average instead of 999) to obtain a similar convergence speed as a fully-connected topology. Furthermore an additional 22\% improvement (14.5 edges per node on average instead of 18.9) is possible when using a small-world inter-clique topology, with further potential gains at larger scales because of its linear-logarithmic scaling.
@@ -189,7 +189,7 @@ expected value of $F_i$ on a random sample $s_i$ drawn from $D_i$.
 
 Removing the assumption of \textit{independent and identically distributed} (IID) data opens a wide range of potential practical difficulties. While non-IID simply means that a local dataset is a biased sample of the global distribution $D$, the difficulty of the learning problem depends on additional factors that compound with that bias. For example, an imbalance in the number of examples for each class represented in the global distribution compounds with the position of the nodes that have the examples of the rarest class. Additionally, if two local datasets have different number of examples, the examples in the smaller dataset will be visited more often than those in a larger dataset, potentially skewing the optimisation process to perform better on the examples seen more often.
 
-To focus our study while still retaining the core aspects of the problem, we make the following assumptions: (1) all classes are equally represented in the global dataset, by randomly removing examples from the larger classes if necessary; (2) all classes are represented on the same number of nodes; (3) all nodes have the same number of examples. Within those assumptions, we take the hardest possible problem, which is to have each node having examples of only a single class. For the following experiments, we use the MNIST (CITE) and CIFAR10 (CITE) datasets.
+To focus our study while still retaining the core aspects of the problem, we make the following assumptions: (1) all classes are equally represented in the global dataset, by randomly removing examples from the larger classes if necessary; (2) all classes are represented on the same number of nodes; (3) all nodes have the same number of examples. Within those assumptions, we take the hardest possible problem, in which each node holds examples of only a single class. For the following experiments, we use the MNIST~\cite{mnistWebsite} and CIFAR10~\cite{krizhevsky2009learning} datasets.
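+% A minimal Python sketch of this partitioning, assuming a flat list of
+% (example, label) pairs; the function name and shuffling details are
+% illustrative, not the exact experimental pipeline:
+%
+%   import random
+%   from collections import defaultdict
+%
+%   def partition_single_class(samples, num_nodes, seed=0):
+%       """Give each node examples of a single class, with balanced sizes."""
+%       rng = random.Random(seed)
+%       by_class = defaultdict(list)
+%       for example, label in samples:
+%           by_class[label].append(example)
+%       classes = sorted(by_class)
+%       nodes_per_class = num_nodes // len(classes)   # assumption (2)
+%       smallest = min(len(v) for v in by_class.values())
+%       per_node = smallest // nodes_per_class        # assumptions (1) and (3)
+%       partitions = []
+%       for label in classes:
+%           examples = by_class[label]
+%           rng.shuffle(examples)                     # surplus examples are dropped
+%           for i in range(nodes_per_class):
+%               partitions.append([(x, label) for x in
+%                                  examples[i * per_node:(i + 1) * per_node]])
+%       return partitions                             # partitions[i]: node i's dataset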
 
 \subsection{Learning Algorithm}
 
@@ -272,16 +272,16 @@ Under our non-IID assumptions (Section~\ref{section:non-iid-assumptions}), a bal
 
 The construction of the resulting \textit{decentralized cliques}, \textsc{D-Cliques}, topology can be performed with Algorithm~\ref{Algorithm:D-Clique-Construction}. Essentially, each clique $C$ is constructed one at a time by selecting nodes with differing classes. Once all cliques are constructed, intra-clique and inter-clique edges are added. 
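+% One plausible greedy construction under the balanced assumptions above,
+% sketched in Python; this illustrates the idea and is not
+% Algorithm~\ref{Algorithm:D-Clique-Construction} itself. The fully-connected
+% pairwise inter-clique step is just one of the variants discussed later:
+%
+%   import itertools
+%
+%   def build_d_cliques(node_classes):
+%       """node_classes maps node -> class; each clique gets one node per class."""
+%       pools = {}
+%       for node, label in sorted(node_classes.items()):
+%           pools.setdefault(label, []).append(node)
+%       labels = sorted(pools)
+%       num_cliques = min(len(pools[l]) for l in labels)
+%       cliques = [[pools[l][i] for l in labels] for i in range(num_cliques)]
+%       edges = set()
+%       for clique in cliques:   # intra-clique edges: fully connect each clique
+%           edges.update(frozenset(p) for p in itertools.combinations(clique, 2))
+%       # inter-clique edges: one edge per pair of cliques, rotating the chosen
+%       # member so the extra degree is spread across the clique.
+%       for a, b in itertools.combinations(range(num_cliques), 2):
+%           edges.add(frozenset((cliques[a][b % len(labels)],
+%                                cliques[b][a % len(labels)])))
+%       return cliques, edges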
 
-Finally, weights are assigned to edges to ensure quick convergence. For this study we use Metropolis-Hasting (CITE), which while not offering optimal convergence speed in the general case, provides good convergence by taking into account the degree of immediate neighbours:
-
+Finally, weights are assigned to edges to ensure quick convergence. For this study we use Metropolis-Hastings weights~\cite{xiao2004fast}, which, while not optimal, are quick to compute and still provide good convergence speed:
 \begin{equation}
   W_{ij} = \begin{cases}
-    max(\text{degree}(i), \text{degree}(j)) + 1 & \text{if}~i \neq j \\
-   1 - \sum_{j \neq i} W_{ij} & \text{otherwise}
+    \frac{1}{\max(\text{degree}(i), \text{degree}(j)) + 1} & \text{if}~i \neq j \text{ and there is an edge between $i$ and $j$,} \\
+   1 - \sum_{j \neq i} W_{ij} & \text{if}~i = j, \\
+   0 & \text{otherwise}
   \end{cases}
 \end{equation}
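+% A minimal Python sketch of this weight assignment, assuming the topology is
+% given as adjacency sets; the function name and data structure are illustrative:
+%
+%   def metropolis_hastings_weights(neighbours):
+%       """neighbours: dict node -> set of adjacent nodes (undirected graph)."""
+%       W = {i: {} for i in neighbours}
+%       for i in neighbours:
+%           for j in neighbours[i]:
+%               W[i][j] = 1.0 / (max(len(neighbours[i]), len(neighbours[j])) + 1)
+%           W[i][i] = 1.0 - sum(W[i][j] for j in neighbours[i])  # self-loop weight
+%       return W   # rows sum to 1; entries for absent edges are implicitly 0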
 
-In this paper, we focus on showing the convergence benefits of such a topology for decentralized federated learning. Algorithm~\ref{Algorithm:D-Clique-Construction} therefore centrally generates the topology, which is then tested in a simulator. We expect this algorithm should be straightforward to adapt for a decentralized execution: the computation of the classes globally present, $L$, could be computed using PushSum (CITE), and the selection of neighbours done with PeerSampling (CITE).
+In this paper, we focus on showing the convergence benefits of such a topology for decentralized federated learning. Algorithm~\ref{Algorithm:D-Clique-Construction} therefore centrally generates the topology, which is then tested in a simulator. We expect this algorithm to be straightforward to adapt to a decentralized execution: the relative frequencies of the classes present globally, a refinement of $L$, could be estimated with PushSum~\cite{kempe2003gossip}, and neighbours could be selected with PeerSampling~\cite{jelasity2007gossip}.
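+% A minimal synchronous PushSum sketch in Python (illustrative; the protocol is
+% gossip-based and asynchronous in practice). Running it on local class counts,
+% one class at a time, and normalising the estimates yields the relative class
+% frequencies mentioned above:
+%
+%   import random
+%
+%   def push_sum_average(values, neighbours, rounds=50, seed=0):
+%       """values: dict node -> float; neighbours: dict node -> list of nodes.
+%       Each node estimates the global average of `values` by gossiping
+%       (sum, weight) pairs with randomly chosen neighbours."""
+%       rng = random.Random(seed)
+%       s = dict(values)                  # running sums, initialised to local values
+%       w = {i: 1.0 for i in values}      # running weights
+%       for _ in range(rounds):
+%           inbox = {i: [(s[i] / 2, w[i] / 2)] for i in values}   # keep half locally
+%           for i in values:
+%               target = rng.choice(neighbours[i])                # push half to a neighbour
+%               inbox[target].append((s[i] / 2, w[i] / 2))
+%           for i in values:
+%               s[i] = sum(si for si, _ in inbox[i])
+%               w[i] = sum(wi for _, wi in inbox[i])
+%       return {i: s[i] / w[i] for i in values}   # converges to the global average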
 
 \begin{figure}[htbp]
     \centering 
@@ -360,7 +360,7 @@ As illustrated in Figure~\ref{fig:d-clique-mnist-clique-avg}, this significantly
 
 Quickly training higher capacity models, such as a deep convolutional network, on harder datasets, such as CIFAR10, usually requires additional optimization techniques. We show here how Clique Averaging (Section~\ref{section:clique-averaging}) easily enables the implementation of optimization techniques in the more general non-IID setting, that otherwise would require IID mini-batches.
 
-In particular, we implement momentum (CITE), which increases the magnitude of the components of the gradient that are shared between several consecutive steps. Momentum is critical for making deep convolutional networks, such as LeNet (CITE), converge quickly. However, a simple application of momentum in a non-IID setting can actually be detrimental. As illustrated in Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-effect}, LeNet, on CIFAR10 with 100 nodes using the 
+In particular, we implement momentum~\cite{pmlr-v28-sutskever13}, which increases the magnitude of the components of the gradient that are shared between several consecutive steps. Momentum is critical for making deep convolutional networks, such as LeNet~\cite{lecun1998gradient,quagmire}, converge quickly. However, a simple application of momentum in a non-IID setting can actually be detrimental. As illustrated in Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-effect}, LeNet, on CIFAR10 with 100 nodes using the 
 D-Cliques and momentum, actually fails to converge.  To put things in context, we compare the convergence speed to a single centralized IID node performing the same number of updates per epoch, therefore using a batch size 100 times larger: this is essentially equivalent to completely removing the impact of the topology, non-IIDness, and decentralized averaging on the convergence speed. As shown, not using momentum gives a better convergence speed, but there is still a significant gap.
 
 \begin{figure}[htbp]
@@ -392,7 +392,7 @@ It then suffices to modify the original gradient step to use momentum:
 x_i^{(k-\frac{1}{2})} \leftarrow x_i^{(k-1)} - \gamma v_i^{(k)} 
 \end{equation}
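+% A minimal Python sketch of this step, assuming the usual heavy-ball momentum
+% buffer applied to the clique-averaged gradient; the names and the NumPy
+% dependency are illustrative:
+%
+%   import numpy as np
+%
+%   def momentum_step(x, v, clique_gradients, gamma=0.1, m=0.9):
+%       """clique_gradients: gradients from all nodes of the clique (local one included)."""
+%       g = np.mean(clique_gradients, axis=0)   # Clique Averaging: unbiased gradient
+%       v = m * v + g                           # assumed momentum buffer update
+%       x = x - gamma * v                       # the gradient step shown above
+%       return x, v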
 
-Using momentum closes the gap, with a slightly lower convergence speed in the first 20 epochs, as illustrated in Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-clique-avg-effect}. We expect a similar approach could enable other optimization techniques (CITE) in non-IID settings.
+Using momentum closes the gap, with a slightly lower convergence speed in the first 20 epochs, as illustrated in Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-clique-avg-effect}. We expect a similar approach could enable other optimization techniques in non-IID settings.
 
  \section{Comparison to Similar Non-Clustered Topologies}
  \label{section:non-clustered}
@@ -459,7 +459,7 @@ First, the scheme that uses the fewest (almost\footnote{A path uses one less edg
 
 Second, surprisingly (to us), another scheme also scales linearly with a logarithmic bound on the averaging shortest number of hops between nodes, which we call "\textit{fractal}". In this scheme, as nodes are added, cliques are assembled in larger groups of $c$ cliques that are connected internally with one edge per pair of cliques, but with only one edge between pairs of larger groups. The scheme is recursive such that $c$ groups will themselves form a larger group the next level up. This scheme results in at most $nc$ edges per node if edges are evenly distributed, and therefore also scales linearly in the number of nodes.
 
-Third, cliques may also be connected in a smallworld-like~\cite{watts2000small} topology, that may be reminiscent of distributed-hash table designs such as Chord (CITE). In this scheme, cliques are first arranged in a ring. Then each clique add symmetric edges, both clockwise and counter-clockwise on the ring, to the $ns$ closest cliques in sets of cliques that are exponentially bigger the further they are on the ring, as detailed in Algorithm~\ref{Algorithm:Smallworld}. This ensures good clustering with other cliques that are close on the ring, while still keeping the average shortest path small (including nodes further on the ring). This scheme adds a $2(ns)log(\frac{n}{c})$ inter-clique edges and therefore grows in the order of $O(n + log(n))$ with the number of nodes.
+Third, cliques may also be connected in a smallworld-like~\cite{watts2000small} topology, reminiscent of distributed hash table designs such as Chord~\cite{stoica2003chord}. In this scheme, cliques are first arranged in a ring. Each clique then adds symmetric edges, both clockwise and counter-clockwise on the ring, to the $ns$ closest cliques, in sets of cliques that grow exponentially larger the further they are on the ring, as detailed in Algorithm~\ref{Algorithm:Smallworld}. This ensures good clustering with cliques that are close on the ring, while keeping the average shortest path small (including to nodes further away on the ring). This scheme adds $2(ns)\log(\frac{n}{c})$ inter-clique edges and therefore grows in the order of $O(n + \log(n))$ with the number of nodes.
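+% One possible reading of this scheme, sketched in Python with inter-clique
+% edges added at exponentially growing ring offsets; this is an interpretation
+% for illustration, not Algorithm~\ref{Algorithm:Smallworld} itself:
+%
+%   def smallworld_interclique_edges(num_cliques, ns=1):
+%       """Each clique connects to the ns closest cliques at offsets 1, 2, 4, ...
+%       in both directions (roughly 2 * ns * log2(num_cliques) edges per clique)."""
+%       edges = set()
+%       for i in range(num_cliques):
+%           offset = 1
+%           while offset < num_cliques:
+%               for k in range(ns):               # ns closest cliques in each set
+%                   for j in ((i + offset + k) % num_cliques,
+%                             (i - offset - k) % num_cliques):
+%                       if j != i:
+%                           edges.add(frozenset((i, j)))
+%               offset *= 2                       # sets grow with distance on the ring
+%       return edges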
 
 \begin{algorithm}[h]
    \caption{$\textit{smallworld}(DC)$:  adds $O(\# N + log(\# N))$ edges}