diff --git a/figures/d-cliques-cifar10-scaling-clique-ring-cst-updates.png b/figures/d-cliques-cifar10-scaling-clique-ring-cst-updates.png new file mode 100644 index 0000000000000000000000000000000000000000..05f246048570c7c5ac3a001b2f6689b997679bf7 Binary files /dev/null and b/figures/d-cliques-cifar10-scaling-clique-ring-cst-updates.png differ diff --git a/figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates.png b/figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates.png new file mode 100644 index 0000000000000000000000000000000000000000..4d9b418dcebabc8821183cfc7831a40a3ce27e3a Binary files /dev/null and b/figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates.png differ diff --git a/figures/d-cliques-cifar10-scaling-fully-connected-cst-updates.png b/figures/d-cliques-cifar10-scaling-fully-connected-cst-updates.png new file mode 100644 index 0000000000000000000000000000000000000000..3ed6281da957e5182092b0b20de5b44f89ea038e Binary files /dev/null and b/figures/d-cliques-cifar10-scaling-fully-connected-cst-updates.png differ diff --git a/main.tex b/main.tex index 2342654774c8498f490c8ee369775026858034e5..6af2c404afc0b50ccc8b956743a47c8b0f111a4a 100644 --- a/main.tex +++ b/main.tex @@ -301,8 +301,7 @@ We solve this problem by decoupling the gradient averaging from the weight avera \caption{\label{fig:d-cliques-mnist-clique-clustering} MNIST: Effects of Clustering within Cliques on Convergence Speed.} \end{figure} - \begin{figure}[htbp] - \centering +% REMOVED: Constant Batch-size % % To regenerate the figure, from directory results/scaling %% python ../../../learn-topology/tools/plot_convergence.py 10/mnist/fully-connected-cliques/all/2021-03-10-14:40:35-CET ../mnist/fully-connected-cliques/all/2021-03-10-10:19:44-CET 1000/mnist/fully-connected-cliques/all/2021-03-10-16:44:35-CET --labels '10 nodes bsz=128' '100 nodes bsz=128' '1000 nodes bsz=128 (45)' --legend 'lower right' --yaxis test-accuracy --save-figure 
../../figures/d-cliques-mnist-scaling-fully-connected-cst-bsz.png --ymin 80 --add-min-max % \begin{subfigure}[b]{0.48\textwidth} @@ -310,14 +309,16 @@ We solve this problem by decoupling the gradient averaging from the weight avera % \includegraphics[width=\textwidth]{figures/d-cliques-mnist-scaling-fully-connected-cst-bsz} % \caption{FCC: Constant Batch-Size} % \end{subfigure} - + + \begin{figure}[htbp] + \centering % To regenerate the figure, from directory results/scaling % python ../../../learn-topology/tools/plot_convergence.py 10/mnist/fully-connected-cliques/all/2021-03-12-09:13:27-CET ../mnist/fully-connected-cliques/all/2021-03-10-10:19:44-CET 1000/mnist/fully-connected-cliques/all/2021-03-14-17:56:26-CET --labels '10 nodes bsz=1280' '100 nodes bsz=128' '1000 nodes bsz=13' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-scaling-fully-connected-cst-updates.png --ymin 80 --add-min-max \begin{subfigure}[b]{0.7\textwidth} \centering \includegraphics[width=\textwidth]{figures/d-cliques-mnist-scaling-fully-connected-cst-updates} - \caption{Fully-Connected} + \caption{Fully-Connected (Cliques), $O(\frac{n^2}{c^2} + nc)$ edges} \end{subfigure} % To regenerate the figure, from directory results/scaling @@ -325,7 +326,7 @@ We solve this problem by decoupling the gradient averaging from the weight avera \begin{subfigure}[b]{0.7\textwidth} \centering \includegraphics[width=\textwidth]{figures/d-cliques-mnist-scaling-fractal-cliques-cst-updates} - \caption{Fractal} + \caption{Fractal, $O(nc)$ edges} \end{subfigure} @@ -334,10 +335,10 @@ We solve this problem by decoupling the gradient averaging from the weight avera \begin{subfigure}[b]{0.7\textwidth} \centering \includegraphics[width=\textwidth]{figures/d-cliques-mnist-scaling-clique-ring-cst-updates} - \caption{Ring} + \caption{Ring, $O(n)$ edges} \end{subfigure} - \caption{\label{fig:d-cliques-mnist-scaling-fully-connected} MNIST: D-Clique Scaling Behaviour (Constant Updates 
per Epoch)} + \caption{\label{fig:d-cliques-mnist-scaling-fully-connected} MNIST: D-Clique Scaling Behaviour, where $n$ is the number of nodes, and $c$ the size of a clique (Constant Updates per Epoch).} \end{figure} Show scaling effect for 10, 100, 1000 nodes (with decreasing sample sizes) for Clique Ring, Fractal, Fully-Connected. @@ -418,13 +419,40 @@ In addition, it is important that all nodes are initialized with the same model \caption{\label{fig:d-cliques-cifar10-clique-clustering} CIFAR10: Effects of Clustering within Cliques on Convergence Speed.} \end{figure} - \begin{figure}[htbp] + + + \begin{figure}[htbp] + \centering + % To regenerate the figure, from directory results/scaling -% python ../../../learn-topology/tools/plot_convergence.py 10/cifar10/fully-connected-cliques/all/2021-03-12-09:13:27-CET ../cifar10/fully-connected-cliques/all/2021-03-10-10:19:44-CET 1000/cifar10/fully-connected-cliques/all/2021-03-12-09:13:28-CET --labels '10 nodes bsz=1280' '100 nodes bsz=128' '1000 nodes bsz=13' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-fully-connected-cst-updates.png --ymin 80 --add-min-max +% python ../../../learn-topology/tools/plot_convergence.py 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/fully-connected-cliques/all/2021-03-10-13:58:57-CET 1000/cifar10/fully-connected-cliques/all/2021-03-14-17:41:20-CET --labels '10 nodes bsz=200' '100 nodes bsz=20' '1000 nodes bsz=2' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-fully-connected-cst-updates.png --add-min-max + + \begin{subfigure}[b]{0.7\textwidth} + \centering + \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-scaling-fully-connected-cst-updates} + \caption{Fully-Connected (Cliques), $O(\frac{n^2}{c^2} + nc)$ edges} + \end{subfigure} + + % To regenerate the figure, from directory results/scaling +% python 
../../../learn-topology/tools/plot_convergence.py 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/fully-connected-cliques/all/2021-03-10-13:58:57-CET 1000/cifar10/fractal-cliques/all/2021-03-14-17:42:46-CET --labels '10 nodes bsz=200' '100 nodes bsz=20' '1000 nodes bsz=2' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates.png --add-min-max + \begin{subfigure}[b]{0.7\textwidth} \centering - %\includegraphics[width=\textwidth]{figures/d-cliques-cifar10-scaling-fully-connected-cst-updates} - \caption{\label{fig:d-cliques-cifar10-scaling-fully-connected} CIFAR10: Scaling Behaviour of Fully-Connected D-Clique (Constant Updates Per Epoch)} + \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates} + \caption{Fractal, $O(nc)$ edges} + \end{subfigure} + + + % To regenerate the figure, from directory results/scaling +% python ../../../learn-topology/tools/plot_convergence.py 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/clique-ring/all/2021-03-10-11:58:43-CET 1000/cifar10/clique-ring/all/2021-03-14-09:55:24-CET --labels '10 nodes bsz=200' '100 nodes bsz=20' '1000 nodes bsz=2' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-clique-ring-cst-updates.png --add-min-max + \begin{subfigure}[b]{0.7\textwidth} + \centering + \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-scaling-clique-ring-cst-updates} + \caption{Ring, $O(n)$ edges} + \end{subfigure} + + \caption{\label{fig:d-cliques-cifar10-scaling-fully-connected} CIFAR10: D-Clique Scaling Behaviour, where $n$ is the number of nodes, and $c$ the size of a clique (Constant Updates per Epoch).} \end{figure} + \subsection{Comparison to similar topologies} @@ -432,7 +460,10 @@ In addition, it is important that all nodes are initialized with the same model Similar number of maximum hops but no or less 
clustering than D-Cliques (and no unbiasing of gradient). \begin{itemize} - \item Choice of 10 random neighbours (static) in a fully connected graph (D-PSGD aléatoire) + \item Choice of 10 random neighbours (static) in a fully connected graph (random D-PSGD), such that all nodes have at most 10 edges + \item 10 random neighbours, such that all nodes have at most 10 edges, but all nodes have neighbours of all classes + \item Same as the previous item, with neighbour averaging + \item Same as the previous item, with neighbour averaging and uniform initialization %\item Uniform Diverse Neighbourhood with No Clustering %\item Random network %\item Random Small-World Graph