From 91a7d3bb6691ca7034f7eed032b2be065b30ac71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien?= <aurelien.bellet@inria.fr>
Date: Fri, 2 Apr 2021 17:42:04 +0200
Subject: [PATCH] use examples. samples only use when we actually sample the
 distribution. notation m for batch size

---
 main.tex | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/main.tex b/main.tex
index 7636b35..9facdfa 100644
--- a/main.tex
+++ b/main.tex
@@ -246,9 +246,9 @@ Each node has access to a local dataset that
 {s_i \sim D_i} [F_i(x;s_i)],
 \label{eq:dist-optimization-problem}
 \end{equation}
-where $s_i$ is a data sample of $D_i$, $F_i$ is the loss function
+where $s_i$ is a data example drawn from $D_i$ and $F_i$ is the loss function
 on node $i$. Therefore, $\mathds{E}_{s_i \sim D_i} F_i(x;s_i)$ denotes the
-expected loss of model $x$ on a random sample $s_i$ drawn from $D_i$.
+expected loss of model $x$ on a random example $s_i$ drawn from $D_i$.
 
 To collaboratively solve Problem \eqref{eq:dist-optimization-problem}, each
 node can exchange messages with its neighbors in an undirected network graph
@@ -265,8 +265,8 @@ shown in Algorithm~\ref{Algorithm:D-PSGD}, a single iteration of D-SGD at node
 $i$ consists of sampling a mini-batch from its local distribution $D_i$,
 updating its local model $x_i$ by taking a stochastic gradient descent
-(SGD) step according to this
-sample, and performing a weighted average of its local model with those of its
+(SGD) step according to the mini-batch, and performing a weighted average of
+its local model with those of its
 neighbors.
 This weighted average is defined by
 a mixing matrix $W$, in which
 $W_{ij}$ corresponds to the weight of
@@ -284,9 +284,11 @@ symmetric, i.e. $W_{ij} = W_{ji}$, see \cite{lian2017d-psgd}.
  \label{Algorithm:D-PSGD}
  \begin{algorithmic}[1]
    \State \textbf{Require:} initial model parameters $x_i^{(0)}$,
-   learning rate $\gamma$, mixing weights $W$, number of steps $K$
+   learning rate $\gamma$, mixing weights $W$, mini-batch size $m$,
+   number of steps $K$
    \For{$k = 1,\ldots, K$}
-     \State $s_i^{(k)} \gets \text{(mini-batch) sample from~} D_i$
+     \State $s_i^{(k)} \gets \text{mini-batch sample of size $m$ drawn
+     from~} D_i$
      \State $x_i^{(k-\frac{1}{2})} \gets x_i^{(k-1)} - \gamma \nabla F(x_i^{(k-1)}; s_i^{(k)})$
      \State $x_i^{(k)} \gets \sum_{j \in N} W_{ji}^{(k)} x_j^{(k-\frac{1}{2})}$
    \EndFor
@@ -301,7 +303,7 @@ symmetric, i.e. $W_{ij} = W_{ji}$, see \cite{lian2017d-psgd}.
 As demonstrated in Figure~\ref{fig:iid-vs-non-iid-problem}, lifting the
 assumption of IID data significantly challenges the learning algorithm. In
 this paper, we focus on an \textit{extreme case of local class bias}: we
-consider that each node only has samples
+consider that each node only has examples
 %examples
 from a single class.
 % Our results should generalize to lesser, and more
@@ -311,7 +313,8 @@ from a single class.
 To isolate the effect of local class bias from other potentially compounding
 factors, we make the following simplifying assumptions: (1) All classes are
-equally represented in the global dataset; (2) All classes are represented on the same number of nodes; (3) All nodes have the same number of samples.
+equally represented in the global dataset; (2) All classes are represented on
+the same number of nodes; (3) All nodes have the same number of examples.
 
 We believe that these assumptions are reasonable
 in the context of our study because:
 (1)
@@ -337,9 +340,9 @@ can remove much of the effect of local class bias.
 We experiment with two datasets: MNIST~\cite{mnistWebsite} and
 CIFAR10~\cite{krizhevsky2009learning},
 which both have $c=10$ classes.
-For MNIST, we use 45k and 10k samples from the original 60k
+For MNIST, we use 45k and 10k examples from the original 60k
 training set for training and validation respectively. The remaining 5k
-training samples were randomly removed to ensure all 10 classes are balanced
+training examples were randomly removed to ensure all 10 classes are balanced
 while ensuring the dataset is evenly divisible across 100 and 1000 nodes. We
 use all 10k examples of the test set to measure test accuracy.
 For CIFAR10, classes are evenly
@@ -402,8 +405,9 @@ In this section, we present the design of D-Cliques. To give an intuition of our
 % where each color represents a class of data.
 The colors of a node represent the different classes it holds locally.
 In the IID setting (Figure~\ref{fig:grid-iid-neighbourhood}), each
-node has samples of all classes in equal proportions. In the non-IID setting
-(Figure~\ref{fig:grid-non-iid-neighbourhood}), each node has samples of only a
+node has examples of all classes in equal proportions. In the non-IID setting
+(Figure~\ref{fig:grid-non-iid-neighbourhood}), each node has examples of only
+a
 single class and nodes are distributed randomly in the grid. A single training step, from the point of view of the center node, is equivalent to sampling a mini-batch five times larger from the union of the local distributions of all illustrated nodes.
 In the IID case, since gradients are computed from examples of all classes,
 the resulting average gradient points in a direction that reduces the
@@ -566,7 +570,7 @@ of the local models across nodes.
 inter-clique connections (see main text).}
 \end{figure}
 
-We address this problem by adding \emph{Clique Averaging} to D-PSGD
+We address this problem by adding \emph{Clique Averaging} to D-SGD
 (Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD}), which essentially
 decouples gradient averaging from model averaging. Only the gradients of
 neighbors within the same clique are used to compute the average gradient,
@@ -575,12 +579,15 @@ models, including those across inter-clique edges, participate in the model
 averaging step as in the original version.
 
 \begin{algorithm}[t]
-  \caption{D-PSGD with Clique Averaging, Node $i$}
+  \caption{D-SGD with Clique Averaging, Node $i$}
  \label{Algorithm:Clique-Unbiased-D-PSGD}
  \begin{algorithmic}[1]
-   \State \textbf{Require} initial model parameters $x_i^{(0)}$, learning rate $\gamma$, mixing weights $W$, number of steps $K$, loss function $F$
+   \State \textbf{Require} initial model parameters $x_i^{(0)}$, learning
+   rate $\gamma$, mixing weights $W$, mini-batch size $m$, number of
+   steps $K$
    \For{$k = 1,\ldots, K$}
-     \State $s_i^{(k)} \gets \textit{sample from~} D_i$
+     \State $s_i^{(k)} \gets \text{mini-batch sample of size $m$ drawn
+     from~} D_i$
      \State $g_i^{(k)} \gets \frac{1}{|\textit{Clique}(i)|}\sum_{j \in \textit{Clique(i)}} \nabla F(x_j^{(k-1)}; s_j^{(k)})$
      \State $x_i^{(k-\frac{1}{2})} \gets x_i^{(k-1)} - \gamma g_i^{(k)}$
      \State $x_i^{(k)} \gets \sum_{j \in N} W_{ji}^{(k)} x_j^{(k-\frac{1}{2})}$
-- 
GitLab
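For reference, a minimal Python/NumPy sketch of the D-SGD loop patched above, simulating all nodes in a single process. The helper callables sample_minibatch and grad_F, the argument names, and the default values of gamma, m and K are assumptions made for this sketch, not the paper's implementation.

import numpy as np

def d_sgd(x0, W, data, grad_F, sample_minibatch, gamma=0.1, m=128, K=100, seed=0):
    # D-SGD: each node takes a local SGD step on a mini-batch of size m,
    # then averages its model with its neighbors using the mixing matrix W.
    # x0: list of per-node initial models (NumPy arrays); W: n x n mixing matrix;
    # data[i]: local dataset of node i; grad_F(x, s): stochastic gradient of the loss;
    # sample_minibatch(data_i, m, rng): draws a mini-batch of size m from D_i.
    rng = np.random.default_rng(seed)
    n = len(x0)
    x = [xi.copy() for xi in x0]
    for k in range(K):
        half = []
        for i in range(n):
            s_ik = sample_minibatch(data[i], m, rng)          # mini-batch of size m from D_i
            half.append(x[i] - gamma * grad_F(x[i], s_ik))    # local SGD step -> x_i^(k-1/2)
        # weighted averaging with neighbors: x_i^(k) = sum_j W_ji * x_j^(k-1/2)
        x = [sum(W[j, i] * half[j] for j in range(n)) for i in range(n)]
    return x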
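Under the same assumptions, a sketch of the Clique Averaging variant patched above: gradients are averaged only over Clique(i), while model averaging still uses all neighbors through W. The clique argument, giving for each node the indices of its clique members (including itself), is a representation chosen for this sketch.

import numpy as np

def d_sgd_clique_averaging(x0, W, clique, data, grad_F, sample_minibatch,
                           gamma=0.1, m=128, K=100, seed=0):
    # Clique Averaging decouples gradient averaging (within Clique(i) only)
    # from model averaging (over all neighbors, exactly as in plain D-SGD).
    rng = np.random.default_rng(seed)
    n = len(x0)
    x = [xi.copy() for xi in x0]
    for k in range(K):
        # every node first computes its local gradient on a fresh mini-batch,
        # because g_i^(k) needs the gradients of all members of Clique(i)
        grads = [grad_F(x[j], sample_minibatch(data[j], m, rng)) for j in range(n)]
        half = []
        for i in range(n):
            g_ik = sum(grads[j] for j in clique[i]) / len(clique[i])  # average within the clique
            half.append(x[i] - gamma * g_ik)                          # x_i^(k-1/2)
        x = [sum(W[j, i] * half[j] for j in range(n)) for i in range(n)]
    return x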
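The data partition assumed throughout the patch (each node holds examples of a single class, every class covers the same number of nodes, and every node holds the same number of examples) could be generated along the following lines; the function name and interface are hypothetical.

import numpy as np

def partition_single_class(labels, n_nodes, seed=0):
    # Assign each node examples of exactly one class, with every class covering
    # the same number of nodes and every node holding the same number of examples.
    rng = np.random.default_rng(seed)
    labels = np.asarray(labels)
    classes = np.unique(labels)
    assert n_nodes % len(classes) == 0, "every class must cover the same number of nodes"
    nodes_per_class = n_nodes // len(classes)
    per_node = len(labels) // n_nodes  # assumes a balanced, evenly divisible dataset
    parts = []
    for cls in classes:
        idx = np.flatnonzero(labels == cls)
        rng.shuffle(idx)
        for b in range(nodes_per_class):
            parts.append(idx[b * per_node:(b + 1) * per_node])
    return parts  # parts[i]: indices of the examples held by node i

For the MNIST setup in the patch, labels would be the 45k balanced training labels and n_nodes would be 100 or 1000, so that per_node comes out to an integer (450 or 45 examples per node).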