From 616284d8e4afb4cb42f56166ce5366bc2119e222 Mon Sep 17 00:00:00 2001
From: Erick Lavoie <erick.lavoie@epfl.ch>
Date: Wed, 10 Mar 2021 09:55:39 +0100
Subject: [PATCH] Added MNIST experiments

---
 main.tex                                      | 20 ++++++++++++-------
 results/mnist/1-node-iid/experiments.sh       | 19 ++++++++++++++++++
 results/mnist/clique-ring/experiments.sh      | 14 +++++++++++++
 .../fully-connected-cliques/experiments.sh    | 14 +++++++++++++
 results/mnist/fully-connected/experiments.sh  | 14 +++++++++++++
 5 files changed, 74 insertions(+), 7 deletions(-)
 create mode 100755 results/mnist/1-node-iid/experiments.sh
 create mode 100755 results/mnist/clique-ring/experiments.sh
 create mode 100755 results/mnist/fully-connected-cliques/experiments.sh
 create mode 100755 results/mnist/fully-connected/experiments.sh

diff --git a/main.tex b/main.tex
index eefa7ff..d86b59a 100644
--- a/main.tex
+++ b/main.tex
@@ -137,10 +137,6 @@ As a summary, we make the following contributions:
 The rest of the paper is organized as such.
 \dots
 
-\section{Related Work}
-
-D2: numerically unstable when $W_{ij}$ rows and columns do not exactly sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-cliques do not modify the SGD algorithm and instead simply removes some neighbour contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
-
 \section{Problem Statement}
 \label{section:problem}
 
@@ -206,6 +202,8 @@ The degree of \textit{skew} of local distributions $D_i$, i.e. how much the loca
 
 The global distribution of classes, for classification tasks, can be computed from the distribution of class examples on the nodes, with Distributed Averaging (CITE). Given the global distribution of classes, neighbours within cliques can be chosen based on a PeerSampling (CITE) service. Both services can be implemented such that they converge in a logarithmic number of steps compared to the number of nodes. It is therefore possible to obtain this information in a scalable way. In the rest of this paper, we assume these services are available and show that the approach provides a useful convergence speed after the cliques have been formed.
+
+ TODO: Clique construction algorithm (including the cases where classes are not equally represented)
 
 \subsection{Connecting Cliques}
 
@@ -307,15 +305,23 @@ TODO: Add D-Cliques arranged in a Ring instead of Fully-Connected
 Similar number of maximum hops but no or less clustering than D-Cliques (and no unbiasing of gradient).
 
 \begin{itemize}
-    \item Uniform Diverse Neighbourhood with No Clustering
-    \item Random network
-    \item Random Small-World Graph
+    \item Choice of 10 random neighbours (static) in a fully connected graph (random D-PSGD)
+    %\item Uniform Diverse Neighbourhood with No Clustering
+    %\item Random network
+    %\item Random Small-World Graph
 \end{itemize}
 
 \subsection{Effect of Scaling}
 
 Show scaling effect for 10, 100, 1000 nodes (with decreasing sample sizes) for Clique Ring, Hierarchical, Fully-Connected.
+Robustness to not having fully-connected cliques (static and dynamic subsets).
+
+\section{Related Work}
+
+D2: numerically unstable when $W_{ij}$ rows and columns do not exactly sum to $1$, as the small differences are amplified in a positive feedback loop. More work is therefore required on the algorithm to make it usable with a wider variety of topologies. In comparison, D-Cliques do not modify the SGD algorithm and instead simply remove some neighbour contributions that would otherwise bias the direction of the gradient. D-Cliques with D-PSGD are therefore as tolerant to ill-conditioned $W_{ij}$ matrices as regular D-PSGD in an IID setting.
+
+
 \section{Future Work}
 \begin{itemize}
     \item Non-uniform Class Representation
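
[Editor's sketch] The clique construction algorithm is left as a TODO in the hunk above. For intuition only, here is a minimal Python sketch of one possible greedy construction from per-node class labels; build_cliques and every other name in it is hypothetical and appears nowhere in this repository. It handles unequal class representation by always drawing from the most-represented remaining class first.

# Hypothetical sketch only: the patch leaves the construction algorithm as a
# TODO, so none of these names exist in the repository.
from collections import defaultdict

def build_cliques(node_classes, max_size=10):
    """Greedily group nodes into cliques whose members cover distinct classes.

    node_classes: dict of node id -> dominant class of its local dataset.
    """
    buckets = defaultdict(list)
    for node, cls in node_classes.items():
        buckets[cls].append(node)
    cliques = []
    while any(buckets.values()):
        clique = []
        # Draw from the most-represented remaining classes first, so that
        # unequally represented classes still spread across all cliques.
        for cls in sorted(buckets, key=lambda c: len(buckets[c]), reverse=True):
            if buckets[cls] and len(clique) < max_size:
                clique.append(buckets[cls].pop())
        cliques.append(clique)
    return cliques

# 100 nodes, 1 class each, 10 nodes per class, as in the scripts below:
print(build_cliques({i: i % 10 for i in range(100)})[:2])

With the experimental setup used in the scripts below (100 nodes, 1 local class per node, 10 nodes per class), this yields 10 cliques of 10 nodes, each clique containing one node per class.
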
diff --git a/results/mnist/1-node-iid/experiments.sh b/results/mnist/1-node-iid/experiments.sh
new file mode 100755
index 0000000..f21e9ad
--- /dev/null
+++ b/results/mnist/1-node-iid/experiments.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+TOOLS=../../../../learn-topology/tools; CWD="$(pwd)"; cd $TOOLS
+# 2560
+# 1280
+# 320
+# 128
+BSZS='
+    12800
+    '
+# 7680
+LRS='
+    0.1
+    '
+for BSZ in $BSZS;
+    do for LR in $LRS;
+        do python sgp-mnist.py --nb-nodes 1 --nb-epochs 100 --local-classes 10 --seed 1 --nodes-per-class 1 1 1 1 1 1 1 1 1 1 --global-train-ratios 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 --dist-optimization d-psgd --topology fully_connected --metric dissimilarity --learning-momentum 0. --sync-per-mini-batch 1 --results-directory $CWD/all --learning-rate $LR --batch-size $BSZ "$@" --single-process
+    done;
+done;
+
diff --git a/results/mnist/clique-ring/experiments.sh b/results/mnist/clique-ring/experiments.sh
new file mode 100755
index 0000000..65e7e7a
--- /dev/null
+++ b/results/mnist/clique-ring/experiments.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+TOOLS=../../../../learn-topology/tools; CWD="$(pwd)"; cd $TOOLS
+BSZS='
+    128
+    '
+LRS='
+    0.1
+    '
+for BSZ in $BSZS;
+    do for LR in $LRS;
+        do python sgp-mnist.py --nb-nodes 100 --nb-epochs 100 --local-classes 1 --seed 1 --nodes-per-class 10 10 10 10 10 10 10 10 10 10 --global-train-ratios 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 --dist-optimization d-psgd --topology fully-connected-cliques --metric dissimilarity --learning-momentum 0. --sync-per-mini-batch 1 --results-directory $CWD/all --learning-rate $LR --batch-size $BSZ "$@" --parallel-training --nb-workers 10 --dataset mnist --model linear --clique-gradient --initial-averaging
+    done;
+done;
+
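
[Editor's sketch] The script above and the fully-connected-cliques script below pass --clique-gradient. Note that clique-ring/experiments.sh is byte-identical to fully-connected-cliques/experiments.sh (both resolve to blob 65e7e7a) and also passes --topology fully-connected-cliques; the ring arrangement itself is still listed as a TODO in main.tex. The PyTorch sketch below illustrates the gradient-averaging idea that the flag name and the Related Work paragraph suggest; it is an assumption about intent, not the sgp-mnist.py implementation, which this patch does not include.

# Illustrative PyTorch sketch of the idea behind --clique-gradient, NOT the
# sgp-mnist.py implementation: clique members average their gradients before
# stepping, so each update direction approximates an IID mini-batch even
# though every node holds data from a single class.
import torch

def clique_step(models, batches, loss_fn, lr=0.1):
    per_model_grads = []
    for model, (x, y) in zip(models, batches):
        model.zero_grad()
        loss_fn(model(x), y).backward()
        per_model_grads.append([p.grad.clone() for p in model.parameters()])
    # Unbiasing step: every clique member applies the clique-averaged gradient.
    with torch.no_grad():
        for model in models:
            for p, *grads in zip(model.parameters(), *per_model_grads):
                p -= lr * torch.stack(grads).mean(dim=0)

In D-PSGD proper, this local step would then be followed by parameter averaging with the node's neighbours according to $W_{ij}$; averaging gradients inside the clique only removes the class bias from the update direction.
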
diff --git a/results/mnist/fully-connected-cliques/experiments.sh b/results/mnist/fully-connected-cliques/experiments.sh
new file mode 100755
index 0000000..65e7e7a
--- /dev/null
+++ b/results/mnist/fully-connected-cliques/experiments.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+TOOLS=../../../../learn-topology/tools; CWD="$(pwd)"; cd $TOOLS
+BSZS='
+    128
+    '
+LRS='
+    0.1
+    '
+for BSZ in $BSZS;
+    do for LR in $LRS;
+        do python sgp-mnist.py --nb-nodes 100 --nb-epochs 100 --local-classes 1 --seed 1 --nodes-per-class 10 10 10 10 10 10 10 10 10 10 --global-train-ratios 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 --dist-optimization d-psgd --topology fully-connected-cliques --metric dissimilarity --learning-momentum 0. --sync-per-mini-batch 1 --results-directory $CWD/all --learning-rate $LR --batch-size $BSZ "$@" --parallel-training --nb-workers 10 --dataset mnist --model linear --clique-gradient --initial-averaging
+    done;
+done;
+
diff --git a/results/mnist/fully-connected/experiments.sh b/results/mnist/fully-connected/experiments.sh
new file mode 100755
index 0000000..4dbf110
--- /dev/null
+++ b/results/mnist/fully-connected/experiments.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+TOOLS=../../../../learn-topology/tools; CWD="$(pwd)"; cd $TOOLS
+BSZS='
+    128
+    '
+LRS='
+    0.1
+    '
+for BSZ in $BSZS;
+    do for LR in $LRS;
+        do python sgp-mnist.py --nb-nodes 100 --nb-epochs 100 --local-classes 1 --seed 1 --nodes-per-class 10 10 10 10 10 10 10 10 10 10 --global-train-ratios 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 0.802568 --dist-optimization d-psgd --topology fully_connected --metric random --learning-momentum 0. --sync-per-mini-batch 1 --results-directory $CWD/all --learning-rate $LR --batch-size $BSZ "$@" --parallel-training --nb-workers 10 --dataset mnist --model linear
+    done;
+done;
+
--
GitLab
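
[Editor's sketch] All four experiments.sh scripts share the same BSZS x LRS sweep skeleton, each with a single value per list. For wider sweeps, an equivalent Python driver is sketched below; it is a convenience suggestion, not part of the patch, and assumes it is run from learn-topology/tools just like the scripts. Every sgp-mnist.py flag is copied verbatim from the fully-connected script above, except --results-directory, which is pointed at a local ./all for brevity.

# Convenience sketch, not part of the patch: drive the same (assumed)
# sgp-mnist.py CLI over a batch-size x learning-rate grid from one place.
# Run from learn-topology/tools, as the experiments.sh scripts do.
import itertools
import subprocess

BATCH_SIZES = [128]      # e.g. extend to [128, 320, 1280]
LEARNING_RATES = [0.1]   # e.g. extend to [0.01, 0.1]

for bsz, lr in itertools.product(BATCH_SIZES, LEARNING_RATES):
    subprocess.run(
        ["python", "sgp-mnist.py",
         "--nb-nodes", "100", "--nb-epochs", "100", "--local-classes", "1",
         "--seed", "1",
         "--nodes-per-class", *(["10"] * 10),
         "--global-train-ratios", *(["0.802568"] * 10),
         "--dist-optimization", "d-psgd",
         "--topology", "fully_connected", "--metric", "random",
         "--learning-momentum", "0.", "--sync-per-mini-batch", "1",
         "--results-directory", "./all",
         "--learning-rate", str(lr), "--batch-size", str(bsz),
         "--parallel-training", "--nb-workers", "10",
         "--dataset", "mnist", "--model", "linear"],
        check=True)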