From bc623e5e950dca922a4e5494b1bf35464785517a Mon Sep 17 00:00:00 2001
From: Erick Lavoie <erick.lavoie@epfl.ch>
Date: Thu, 23 Sep 2021 15:37:17 +0200
Subject: [PATCH] Added draft of MLSys paper

---
 mlsys2022style/clean.sh      |    2 +
 mlsys2022style/fancyhdr.sty  |  485 ++++++++++++
 mlsys2022style/main.tex      | 1328 +++++++++++++++++++++++++++++++
 mlsys2022style/mlsys2022.bst | 1439 ++++++++++++++++++++++++++++++++++
 mlsys2022style/mlsys2022.sty |  750 ++++++++++++++++++
 5 files changed, 4004 insertions(+)
 create mode 100755 mlsys2022style/clean.sh
 create mode 100644 mlsys2022style/fancyhdr.sty
 create mode 100644 mlsys2022style/main.tex
 create mode 100644 mlsys2022style/mlsys2022.bst
 create mode 100644 mlsys2022style/mlsys2022.sty

diff --git a/mlsys2022style/clean.sh b/mlsys2022style/clean.sh
new file mode 100755
index 0000000..6f59830
--- /dev/null
+++ b/mlsys2022style/clean.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+rm -f -- main.aux main.bbl main.blg main.log main.pdf main.synctex.gz main.out  # -f: succeed even when some artifacts were never built
diff --git a/mlsys2022style/fancyhdr.sty b/mlsys2022style/fancyhdr.sty
new file mode 100644
index 0000000..77ed4e3
--- /dev/null
+++ b/mlsys2022style/fancyhdr.sty
@@ -0,0 +1,485 @@
+% fancyhdr.sty version 3.2
+% Fancy headers and footers for LaTeX.
+% Piet van Oostrum, 
+% Dept of Computer and Information Sciences, University of Utrecht,
+% Padualaan 14, P.O. Box 80.089, 3508 TB Utrecht, The Netherlands
+% Telephone: +31 30 2532180. Email: piet@cs.uu.nl
+% ========================================================================
+% LICENCE:
+% This file may be distributed under the terms of the LaTeX Project Public
+% License, as described in lppl.txt in the base LaTeX distribution.
+% Either version 1 or, at your option, any later version.
+% ========================================================================
+% MODIFICATION HISTORY:
+% Sep 16, 1994
+% version 1.4: Correction for use with \reversemargin
+% Sep 29, 1994:
+% version 1.5: Added the \iftopfloat, \ifbotfloat and \iffloatpage commands
+% Oct 4, 1994:
+% version 1.6: Reset single spacing in headers/footers for use with
+% setspace.sty or doublespace.sty
+% Oct 4, 1994:
+% version 1.7: changed \let\@mkboth\markboth to
+% \def\@mkboth{\protect\markboth} to make it more robust
+% Dec 5, 1994:
+% version 1.8: corrections for amsbook/amsart: define \@chapapp and (more
+% importantly) use the \chapter/sectionmark definitions from ps@headings if
+% they exist (which should be true for all standard classes).
+% May 31, 1995:
+% version 1.9: The proposed \renewcommand{\headrulewidth}{\iffloatpage...
+% construction in the doc did not work properly with the fancyplain style. 
+% June 1, 1995:
+% version 1.91: The definition of \@mkboth wasn't restored on subsequent
+% \pagestyle{fancy}'s.
+% June 1, 1995:
+% version 1.92: The sequence \pagestyle{fancyplain} \pagestyle{plain}
+% \pagestyle{fancy} would erroneously select the plain version.
+% June 1, 1995:
+% version 1.93: \fancypagestyle command added.
+% Dec 11, 1995:
+% version 1.94: suggested by Conrad Hughes <chughes@maths.tcd.ie>
+% CJCH, Dec 11, 1995: added \footruleskip to allow control over footrule
+% position (old hardcoded value of .3\normalbaselineskip is far too high
+% when used with very small footer fonts).
+% Jan 31, 1996:
+% version 1.95: call \@normalsize in the reset code if that is defined,
+% otherwise \normalsize.
+% this is to solve a problem with ucthesis.cls, as this doesn't
+% define \@currsize. Unfortunately for latex209 calling \normalsize doesn't
+% work as this is optimized to do very little, so there \@normalsize should
+% be called. Hopefully this code works for all versions of LaTeX known to
+% mankind.  
+% April 25, 1996:
+% version 1.96: initialize \headwidth to a magic (negative) value to catch
+% most common cases that people change it before calling \pagestyle{fancy}.
+% Note it can't be initialized when reading in this file, because
+% \textwidth could be changed afterwards. This is quite probable.
+% We also switch to \MakeUppercase rather than \uppercase and introduce a
+% \nouppercase command for use in headers and footers.
+% May 3, 1996:
+% version 1.97: Two changes:
+% 1. Undo the change in version 1.8 (using the pagestyle{headings} defaults
+% for the chapter and section marks. The current version of amsbook and
+% amsart classes don't seem to need them anymore. Moreover the standard
+% latex classes don't use \markboth if twoside isn't selected, and this is
+% confusing as \leftmark doesn't work as expected.
+% 2. include a call to \ps@empty in ps@@fancy. This is to solve a problem
+% in the amsbook and amsart classes, that make global changes to \topskip,
+% which are reset in \ps@empty. Hopefully this doesn't break other things.
+% May 7, 1996:
+% version 1.98:
+% Added % after the line  \def\nouppercase
+% May 7, 1996:
+% version 1.99: This is the alpha version of fancyhdr 2.0
+% Introduced the new commands \fancyhead, \fancyfoot, and \fancyhf.
+% Changed \headrulewidth, \footrulewidth, \footruleskip to
+% macros rather than length parameters, In this way they can be
+% conditionalized and they don't consume length registers. There is no need
+% to have them as length registers unless you want to do calculations with
+% them, which is unlikely. Note that this may make some uses of them
+% incompatible (i.e. if you have a file that uses \setlength or \xxxx=)
+% May 10, 1996:
+% version 1.99a:
+% Added a few more % signs
+% May 10, 1996:
+% version 1.99b:
+% Changed the syntax of \f@nfor to be resistent to catcode changes of :=
+% Removed the [1] from the defs of \lhead etc. because the parameter is
+% consumed by the \@[xy]lhead etc. macros.
+% June 24, 1997:
+% version 1.99c:
+% corrected \nouppercase to also include the protected form of \MakeUppercase
+% \global added to manipulation of \headwidth.
+% \iffootnote command added.
+% Some comments added about \@fancyhead and \@fancyfoot.
+% Aug 24, 1998
+% version 1.99d
+% Changed the default \ps@empty to \ps@@empty in order to allow
+% \fancypagestyle{empty} redefinition.
+% Oct 11, 2000
+% version 2.0
+% Added LPPL license clause.
+%
+% A check for \headheight is added. An errormessage is given (once) if the
+% header is too large. Empty headers don't generate the error even if
+% \headheight is very small or even 0pt. 
+% Warning added for the use of 'E' option when twoside option is not used.
+% In this case the 'E' fields will never be used.
+%
+% Mar 10, 2002
+% version 2.1beta
+% New command: \fancyhfoffset[place]{length}
+% defines offsets to be applied to the header/footer to let it stick into
+% the margins (if length > 0).
+% place is like in fancyhead, except that only E,O,L,R can be used.
+% This replaces the old calculation based on \headwidth and the marginpar
+% area.
+% \headwidth will be dynamically calculated in the headers/footers when
+% this is used.
+%
+% Mar 26, 2002
+% version 2.1beta2
+% \fancyhfoffset now also takes h,f as possible letters in the argument to
+% allow the header and footer widths to be different.
+% New commands \fancyheadoffset and \fancyfootoffset added comparable to
+% \fancyhead and \fancyfoot.
+% Errormessages and warnings have been made more informative.
+%
+% Dec 9, 2002
+% version 2.1
+% The defaults for \footrulewidth, \plainheadrulewidth and
+% \plainfootrulewidth are changed from \z@skip to 0pt. In this way when
+% someone inadvertantly uses \setlength to change any of these, the value
+% of \z@skip will not be changed, rather an errormessage will be given.
+
+% March 3, 2004
+% Release of version 3.0
+
+% Oct 7, 2004
+% version 3.1
+% Added '\endlinechar=13' to \fancy@reset to prevent problems with
+% includegraphics in header when verbatiminput is active.
+
+% March 22, 2005
+% version 3.2
+% reset \everypar (the real one) in \fancy@reset because spanish.ldf does
+% strange things with \everypar between << and >>.
+
+\def\ifancy@mpty#1{\def\temp@a{#1}\ifx\temp@a\@empty}
+
+\def\fancy@def#1#2{\ifancy@mpty{#2}\fancy@gbl\def#1{\leavevmode}\else
+                                   \fancy@gbl\def#1{#2\strut}\fi}
+
+\let\fancy@gbl\global
+
+\def\@fancyerrmsg#1{%
+        \ifx\PackageError\undefined
+        \errmessage{#1}\else
+        \PackageError{Fancyhdr}{#1}{}\fi}
+\def\@fancywarning#1{%
+        \ifx\PackageWarning\undefined
+        \errmessage{#1}\else
+        \PackageWarning{Fancyhdr}{#1}{}\fi}
+
+% Usage: \@forc \var{charstring}{command to be executed for each char}
+% Similar to LaTeX's \@tfor, but the first token of charstring is expanded once (so a macro holding the characters can be passed); \var is \def'ed to each character in turn, and an empty charstring executes the command zero times.

+\def\@forc#1#2#3{\expandafter\f@rc\expandafter#1\expandafter{#2}{#3}}
+\def\f@rc#1#2#3{\def\temp@ty{#2}\ifx\@empty\temp@ty\else
+                                    \f@@rc#1#2\f@@rc{#3}\fi}
+\def\f@@rc#1#2#3\f@@rc#4{\def#1{#2}#4\f@rc#1{#3}{#4}}
+
+% Usage: \f@nfor\name{list}{body}  (three brace-delimited arguments; the older \name:=list\do{body} syntax was changed in version 1.99b)
+% Like LaTeX's \@for but an empty list is treated as a list with an empty
+% element

+\newcommand{\f@nfor}[3]{\edef\@fortmp{#2}%
+    \expandafter\@forloop#2,\@nil,\@nil\@@#1{#3}}
+
+% Usage: \def@ult \cs{defaults}{argument}
+% sets \cs to the characters from defaults appearing in argument
+% or defaults if it would be empty. All characters are lowercased.
+
+\newcommand\def@ult[3]{%
+    \edef\temp@a{\lowercase{\edef\noexpand\temp@a{#3}}}\temp@a
+    \def#1{}%
+    \@forc\tmpf@ra{#2}%
+        {\expandafter\if@in\tmpf@ra\temp@a{\edef#1{#1\tmpf@ra}}{}}%
+    \ifx\@empty#1\def#1{#2}\fi}
+% 
+% \if@in <char><set><truecase><falsecase>
+% Executes <truecase> if <char> occurs in <set>, <falsecase> otherwise; overwrites the scratch macros \temp@a and \temp@b.
+\newcommand{\if@in}[4]{%
+    \edef\temp@a{#2}\def\temp@b##1#1##2\temp@b{\def\temp@b{##1}}%
+    \expandafter\temp@b#2#1\temp@b\ifx\temp@a\temp@b #4\else #3\fi}
+
+\newcommand{\fancyhead}{\@ifnextchar[{\f@ncyhf\fancyhead h}%
+                                     {\f@ncyhf\fancyhead h[]}}
+\newcommand{\fancyfoot}{\@ifnextchar[{\f@ncyhf\fancyfoot f}%
+                                     {\f@ncyhf\fancyfoot f[]}}
+\newcommand{\fancyhf}{\@ifnextchar[{\f@ncyhf\fancyhf{}}%
+                                   {\f@ncyhf\fancyhf{}[]}}
+
+% New commands for offsets added
+
+\newcommand{\fancyheadoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyheadoffset h}%
+                                           {\f@ncyhfoffs\fancyheadoffset h[]}}
+\newcommand{\fancyfootoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyfootoffset f}%
+                                           {\f@ncyhfoffs\fancyfootoffset f[]}}
+\newcommand{\fancyhfoffset}{\@ifnextchar[{\f@ncyhfoffs\fancyhfoffset{}}%
+                                         {\f@ncyhfoffs\fancyhfoffset{}[]}}
+
+% The header and footer fields are stored in command sequences with
+% names of the form: \f@ncy<x><y><z> with <x> for [eo], <y> from [lcr]
+% and <z> from [hf].
+
+\def\f@ncyhf#1#2[#3]#4{%
+    \def\temp@c{}%
+    \@forc\tmpf@ra{#3}%
+        {\expandafter\if@in\tmpf@ra{eolcrhf,EOLCRHF}%
+            {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+    \ifx\@empty\temp@c\else
+        \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
+          [#3]}%
+    \fi
+    \f@nfor\temp@c{#3}%
+        {\def@ult\f@@@eo{eo}\temp@c
+         \if@twoside\else
+           \if\f@@@eo e\@fancywarning
+             {\string#1's `E' option without twoside option is useless}\fi\fi
+         \def@ult\f@@@lcr{lcr}\temp@c
+         \def@ult\f@@@hf{hf}{#2\temp@c}%
+         \@forc\f@@eo\f@@@eo
+             {\@forc\f@@lcr\f@@@lcr
+                 {\@forc\f@@hf\f@@@hf
+                     {\expandafter\fancy@def\csname
+                      f@ncy\f@@eo\f@@lcr\f@@hf\endcsname
+                      {#4}}}}}}
+
+\def\f@ncyhfoffs#1#2[#3]#4{%
+    \def\temp@c{}%
+    \@forc\tmpf@ra{#3}%
+        {\expandafter\if@in\tmpf@ra{eolrhf,EOLRHF}%
+            {}{\edef\temp@c{\temp@c\tmpf@ra}}}%
+    \ifx\@empty\temp@c\else
+        \@fancyerrmsg{Illegal char `\temp@c' in \string#1 argument:
+          [#3]}%
+    \fi
+    \f@nfor\temp@c{#3}%
+        {\def@ult\f@@@eo{eo}\temp@c
+         \if@twoside\else
+           \if\f@@@eo e\@fancywarning
+             {\string#1's `E' option without twoside option is useless}\fi\fi
+         \def@ult\f@@@lcr{lr}\temp@c
+         \def@ult\f@@@hf{hf}{#2\temp@c}%
+         \@forc\f@@eo\f@@@eo
+             {\@forc\f@@lcr\f@@@lcr
+                 {\@forc\f@@hf\f@@@hf
+                     {\expandafter\setlength\csname
+                      f@ncyO@\f@@eo\f@@lcr\f@@hf\endcsname
+                      {#4}}}}}%
+     \fancy@setoffs}
+
+% Fancyheadings version 1 commands. These are more or less deprecated,
+% but they continue to work.
+
+\newcommand{\lhead}{\@ifnextchar[{\@xlhead}{\@ylhead}}
+\def\@xlhead[#1]#2{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#2}}
+\def\@ylhead#1{\fancy@def\f@ncyelh{#1}\fancy@def\f@ncyolh{#1}}
+
+\newcommand{\chead}{\@ifnextchar[{\@xchead}{\@ychead}}
+\def\@xchead[#1]#2{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#2}}
+\def\@ychead#1{\fancy@def\f@ncyech{#1}\fancy@def\f@ncyoch{#1}}
+
+\newcommand{\rhead}{\@ifnextchar[{\@xrhead}{\@yrhead}}
+\def\@xrhead[#1]#2{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#2}}
+\def\@yrhead#1{\fancy@def\f@ncyerh{#1}\fancy@def\f@ncyorh{#1}}
+
+\newcommand{\lfoot}{\@ifnextchar[{\@xlfoot}{\@ylfoot}}
+\def\@xlfoot[#1]#2{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#2}}
+\def\@ylfoot#1{\fancy@def\f@ncyelf{#1}\fancy@def\f@ncyolf{#1}}
+
+\newcommand{\cfoot}{\@ifnextchar[{\@xcfoot}{\@ycfoot}}
+\def\@xcfoot[#1]#2{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#2}}
+\def\@ycfoot#1{\fancy@def\f@ncyecf{#1}\fancy@def\f@ncyocf{#1}}
+
+\newcommand{\rfoot}{\@ifnextchar[{\@xrfoot}{\@yrfoot}}
+\def\@xrfoot[#1]#2{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#2}}
+\def\@yrfoot#1{\fancy@def\f@ncyerf{#1}\fancy@def\f@ncyorf{#1}}
+
+\newlength{\fancy@headwidth}
+\let\headwidth\fancy@headwidth
+\newlength{\f@ncyO@elh}
+\newlength{\f@ncyO@erh}
+\newlength{\f@ncyO@olh}
+\newlength{\f@ncyO@orh}
+\newlength{\f@ncyO@elf}
+\newlength{\f@ncyO@erf}
+\newlength{\f@ncyO@olf}
+\newlength{\f@ncyO@orf}
+\newcommand{\headrulewidth}{0.4pt}
+\newcommand{\footrulewidth}{0pt}
+\newcommand{\footruleskip}{.3\normalbaselineskip}
+
+% Fancyplain stuff shouldn't be used anymore (rather
+% \fancypagestyle{plain} should be used), but it must be present for
+% compatibility reasons.
+
+\newcommand{\plainheadrulewidth}{0pt}
+\newcommand{\plainfootrulewidth}{0pt}
+\newif\if@fancyplain \@fancyplainfalse
+\def\fancyplain#1#2{\if@fancyplain#1\else#2\fi}
+
+\headwidth=-123456789sp %magic constant
+
+% Command to reset various things in the headers:
+% a.o.  single spacing (taken from setspace.sty)
+% and the catcode of ^^M (so that epsf files in the header work if a
+% verbatim crosses a page boundary)
+% It also defines a \nouppercase command that disables \uppercase and
+% \MakeUppercase. It can only be used in the headers and footers.
+\let\fnch@everypar\everypar% save real \everypar because of spanish.ldf
+\def\fancy@reset{\fnch@everypar{}\restorecr\endlinechar=13
+ \def\baselinestretch{1}%
+ \def\nouppercase##1{{\let\uppercase\relax\let\MakeUppercase\relax
+     \expandafter\let\csname MakeUppercase \endcsname\relax##1}}%
+ \ifx\undefined\@newbaseline% NFSS not present; 2.09 or 2e
+   \ifx\@normalsize\undefined \normalsize % for ucthesis.cls
+   \else \@normalsize \fi
+ \else% NFSS (2.09) present
+  \@newbaseline%
+ \fi}
+
+% Initialization of the head and foot text.
+
+% The default values still contain \fancyplain for compatibility.
+\fancyhf{} % clear all
+% lefthead empty on ``plain'' pages, \rightmark on even, \leftmark on odd pages
+% evenhead empty on ``plain'' pages, \leftmark on even, \rightmark on odd pages
+\if@twoside
+  \fancyhead[el,or]{\fancyplain{}{\sl\rightmark}}
+  \fancyhead[er,ol]{\fancyplain{}{\sl\leftmark}}
+\else
+  \fancyhead[l]{\fancyplain{}{\sl\rightmark}}
+  \fancyhead[r]{\fancyplain{}{\sl\leftmark}}
+\fi
+\fancyfoot[c]{\rm\thepage} % page number
+
+% \@fancyvbox{<length>}{<material>}: sets <material> in a \vbox; if it is taller than <length>, issues a warning and globally enlarges <length> to that height.
+% Box 0 is used as a temp box and dimen 0 as a temp dimen. This can be done because this code will always
+% be used inside another box, and therefore the changes are local.

+\def\@fancyvbox#1#2{\setbox0\vbox{#2}\ifdim\ht0>#1\@fancywarning
+  {\string#1 is too small (\the#1): ^^J Make it at least \the\ht0.^^J
+    We now make it that large for the rest of the document.^^J
+    This may cause the page layout to be inconsistent, however\@gobble}%
+  \dimen0=#1\global\setlength{#1}{\ht0}\ht0=\dimen0\fi
+  \box0}
+
+% Put together a header or footer given the left, center and
+% right text, fillers at left and right and a rule.
+% The \lap commands put the text into an hbox of zero size,
+% so overlapping text does not generate an errormessage.
+% These macros have 5 parameters:
+% 1. LEFTSIDE BEARING % This determines at which side the header will stick
+%    out. When \fancyhfoffset is used this calculates \headwidth, otherwise
+%    it is \hss or \relax (after expansion).
+% 2. \f@ncyolh, \f@ncyelh, \f@ncyolf or \f@ncyelf. This is the left component.
+% 3. \f@ncyoch, \f@ncyech, \f@ncyocf or \f@ncyecf. This is the middle comp.
+% 4. \f@ncyorh, \f@ncyerh, \f@ncyorf or \f@ncyerf. This is the right component.
+% 5. RIGHTSIDE BEARING. This is always \relax or \hss (after expansion).
+
+\def\@fancyhead#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
+  \@fancyvbox\headheight{\hbox
+    {\rlap{\parbox[b]{\headwidth}{\raggedright#2}}\hfill
+      \parbox[b]{\headwidth}{\centering#3}\hfill
+      \llap{\parbox[b]{\headwidth}{\raggedleft#4}}}\headrule}}#5}
+
+\def\@fancyfoot#1#2#3#4#5{#1\hbox to\headwidth{\fancy@reset
+    \@fancyvbox\footskip{\footrule
+      \hbox{\rlap{\parbox[t]{\headwidth}{\raggedright#2}}\hfill
+        \parbox[t]{\headwidth}{\centering#3}\hfill
+        \llap{\parbox[t]{\headwidth}{\raggedleft#4}}}}}#5}
+
+\def\headrule{{\if@fancyplain\let\headrulewidth\plainheadrulewidth\fi
+    \hrule\@height\headrulewidth\@width\headwidth \vskip-\headrulewidth}}
+
+\def\footrule{{\if@fancyplain\let\footrulewidth\plainfootrulewidth\fi
+    \vskip-\footruleskip\vskip-\footrulewidth
+    \hrule\@width\headwidth\@height\footrulewidth\vskip\footruleskip}}
+
+\def\ps@fancy{%
+\@ifundefined{@chapapp}{\let\@chapapp\chaptername}{}%for amsbook
+%
+% Define \MakeUppercase for old LaTeXen.
+% Note: we used \def rather than \let, so that \let\uppercase\relax (from
+% the version 1 documentation) will still work.
+%
+\@ifundefined{MakeUppercase}{\def\MakeUppercase{\uppercase}}{}%
+\@ifundefined{chapter}{\def\sectionmark##1{\markboth
+{\MakeUppercase{\ifnum \c@secnumdepth>\z@
+ \thesection\hskip 1em\relax \fi ##1}}{}}%
+\def\subsectionmark##1{\markright {\ifnum \c@secnumdepth >\@ne
+ \thesubsection\hskip 1em\relax \fi ##1}}}%
+{\def\chaptermark##1{\markboth {\MakeUppercase{\ifnum \c@secnumdepth>\m@ne
+ \@chapapp\ \thechapter. \ \fi ##1}}{}}%
+\def\sectionmark##1{\markright{\MakeUppercase{\ifnum \c@secnumdepth >\z@
+ \thesection. \ \fi ##1}}}}%
+%\csname ps@headings\endcsname % use \ps@headings defaults if they exist
+\ps@@fancy
+\gdef\ps@fancy{\@fancyplainfalse\ps@@fancy}%
+% Initialize \headwidth if the user didn't
+%
+\ifdim\headwidth<0sp
+%
+% This catches the case that \headwidth hasn't been initialized and the
+% case that the user added something to \headwidth in the expectation that
+% it was initialized to \textwidth. We compensate this now. This loses if
+% the user intended to multiply it by a factor. But that case is more
+% likely done by saying something like \headwidth=1.2\textwidth. 
+% The doc says you have to change \headwidth after the first call to
+% \pagestyle{fancy}. This code is just to catch the most common cases where
+% that requirement is violated.
+%
+    \global\advance\headwidth123456789sp\global\advance\headwidth\textwidth
+\fi}
+\def\ps@fancyplain{\ps@fancy \let\ps@plain\ps@plain@fancy}
+\def\ps@plain@fancy{\@fancyplaintrue\ps@@fancy}
+\let\ps@@empty\ps@empty
+\def\ps@@fancy{%
+\ps@@empty % This is for amsbook/amsart, which do strange things with \topskip
+\def\@mkboth{\protect\markboth}%
+\def\@oddhead{\@fancyhead\fancy@Oolh\f@ncyolh\f@ncyoch\f@ncyorh\fancy@Oorh}%
+\def\@oddfoot{\@fancyfoot\fancy@Oolf\f@ncyolf\f@ncyocf\f@ncyorf\fancy@Oorf}%
+\def\@evenhead{\@fancyhead\fancy@Oelh\f@ncyelh\f@ncyech\f@ncyerh\fancy@Oerh}%
+\def\@evenfoot{\@fancyfoot\fancy@Oelf\f@ncyelf\f@ncyecf\f@ncyerf\fancy@Oerf}%
+}
+% Default definitions for compatibility mode:
+% These cause the header/footer to take the defined \headwidth as width
+% And to shift in the direction of the marginpar area
+
+\def\fancy@Oolh{\if@reversemargin\hss\else\relax\fi}
+\def\fancy@Oorh{\if@reversemargin\relax\else\hss\fi}
+\let\fancy@Oelh\fancy@Oorh
+\let\fancy@Oerh\fancy@Oolh
+
+\let\fancy@Oolf\fancy@Oolh
+\let\fancy@Oorf\fancy@Oorh
+\let\fancy@Oelf\fancy@Oelh
+\let\fancy@Oerf\fancy@Oerh
+
+% New definitions for the use of \fancyhfoffset
+% These calculate the \headwidth from \textwidth and the specified offsets.
+
+\def\fancy@offsolh{\headwidth=\textwidth\advance\headwidth\f@ncyO@olh
+                   \advance\headwidth\f@ncyO@orh\hskip-\f@ncyO@olh}
+\def\fancy@offselh{\headwidth=\textwidth\advance\headwidth\f@ncyO@elh
+                   \advance\headwidth\f@ncyO@erh\hskip-\f@ncyO@elh}
+
+\def\fancy@offsolf{\headwidth=\textwidth\advance\headwidth\f@ncyO@olf
+                   \advance\headwidth\f@ncyO@orf\hskip-\f@ncyO@olf}
+\def\fancy@offself{\headwidth=\textwidth\advance\headwidth\f@ncyO@elf
+                   \advance\headwidth\f@ncyO@erf\hskip-\f@ncyO@elf}
+
+\def\fancy@setoffs{%
+% Just in case \let\headwidth\textwidth was used
+  \fancy@gbl\let\headwidth\fancy@headwidth
+  \fancy@gbl\let\fancy@Oolh\fancy@offsolh
+  \fancy@gbl\let\fancy@Oelh\fancy@offselh
+  \fancy@gbl\let\fancy@Oorh\hss
+  \fancy@gbl\let\fancy@Oerh\hss
+  \fancy@gbl\let\fancy@Oolf\fancy@offsolf
+  \fancy@gbl\let\fancy@Oelf\fancy@offself
+  \fancy@gbl\let\fancy@Oorf\hss
+  \fancy@gbl\let\fancy@Oerf\hss}
+
+\newif\iffootnote
+\let\latex@makecol\@makecol
+\def\@makecol{\ifvoid\footins\footnotetrue\else\footnotefalse\fi
+\let\topfloat\@toplist\let\botfloat\@botlist\latex@makecol}
+\def\iftopfloat#1#2{\ifx\topfloat\empty #2\else #1\fi}
+\def\ifbotfloat#1#2{\ifx\botfloat\empty #2\else #1\fi}
+\def\iffloatpage#1#2{\if@fcolmade #1\else #2\fi}
+
+\newcommand{\fancypagestyle}[2]{%
+  \@namedef{ps@#1}{\let\fancy@gbl\relax#2\relax\ps@fancy}}
diff --git a/mlsys2022style/main.tex b/mlsys2022style/main.tex
new file mode 100644
index 0000000..2d8edd9
--- /dev/null
+++ b/mlsys2022style/main.tex
@@ -0,0 +1,1328 @@
+%%%%%%%% mlsys 2022 EXAMPLE LATEX SUBMISSION FILE %%%%%%%%%%%%%%%%%
+
+\documentclass{article}
+
+% Recommended, but optional, packages for figures and better typesetting:
+%\usepackage{microtype}
+\usepackage{graphicx}
+\usepackage{booktabs} % for professional tables
+\usepackage[utf8]{inputenc}
+\usepackage{amsmath}
+\usepackage{amsfonts}
+\usepackage{amssymb}
+\usepackage{xcolor}
+\usepackage{soul}
+%\usepackage{algorithm}
+%\usepackage[noend]{algpseudocode}
+\usepackage{dsfont}
+\usepackage{caption}
+\usepackage{subcaption}
+
+% hyperref makes hyperlinks in the resulting PDF.
+% If your build breaks (sometimes temporarily if a hyperlink spans a page)
+% please comment out the following usepackage line and replace
+% \usepackage{mlsys2022} with \usepackage[nohyperref]{mlsys2022} below.
+\usepackage{hyperref}
+
+% Attempt to make hyperref and algorithmic work together better:
+\newcommand{\theHalgorithm}{\arabic{algorithm}}
+
+% Use the following line for the initial blind version submitted for review:
+%\usepackage{mlsys2022}
+
+% If accepted, instead use the following line for the camera-ready submission:
+ \usepackage[accepted]{mlsys2022}
+
+% The \mlsystitle you define below is probably too long as a header.
+% Therefore, a short form for the running title is supplied here:
+%\mlsystitlerunning{D-Cliques}
+
+\begin{document}
+
+\twocolumn[
+\mlsystitle{D-Cliques: Compensating for Non-IIDness in Decentralized Federated Learning
+with Topology}
+
+% It is OKAY to include author information, even for blind
+% submissions: the style file will automatically remove it for you
+% unless you've provided the [accepted] option to the mlsys2022
+% package.
+
+% List of affiliations: The first argument should be a (short)
+% identifier you will use later to specify author affiliations
+% Academic affiliations should list Department, University, City, Region, Country
+% Industry affiliations should list Company, City, Region, Country
+
+% You can specify symbols, otherwise they are numbered in order.
+% Ideally, you should not use this facility. Affiliations will be numbered
+% in order of appearance and this is the preferred way.
+%\mlsyssetsymbol{equal}{*}
+
+\begin{mlsysauthorlist}
+\mlsysauthor{Aur\'elien Bellet}{inria-lille}
+\mlsysauthor{Anne-Marie Kermarrec}{epfl}
+\mlsysauthor{Erick Lavoie}{epfl}
+\end{mlsysauthorlist}
+
+\mlsysaffiliation{epfl}{EPFL, Lausanne, Switzerland}
+\mlsysaffiliation{inria-lille}{Inria, Lille, France}
+
+\mlsyscorrespondingauthor{Erick Lavoie}{erick.lavoie@epfl.ch}
+
+% You may provide any keywords that you
+% find helpful for describing your paper; these are used to populate
+% the "keywords" metadata in the PDF but will not be shown in the document
+\mlsyskeywords{Decentralized Learning, Federated Learning, Topology,
+Non-IID Data, Stochastic Gradient Descent}
+
+\vskip 0.3in
+
+\begin{abstract}
+%This document provides a basic paper template and submission guidelines.
+%Abstracts must be a single paragraph, ideally between 4--6 sentences long.
+%Gross violations will trigger corrections at the camera-ready phase.
+The convergence speed of machine learning models trained with Federated
+Learning is significantly affected by non-independent and identically
+distributed (non-IID) data partitions, even more so in a fully decentralized
+setting without a central server. In this paper, we show that the impact of
+\textit{local class bias}, an important type of data non-IIDness, can be
+significantly reduced by carefully designing
+the underlying communication topology. We present D-Cliques, a novel topology
+that reduces gradient bias by grouping nodes in interconnected cliques such
+that the local joint distribution in a clique is representative of the global
+class distribution. We also show how to adapt the updates of decentralized SGD
+to obtain unbiased gradients and implement an effective momentum with
+D-Cliques. Our empirical evaluation on MNIST and CIFAR10 demonstrates that our approach
+provides a convergence speed similar to that of a fully-connected topology with a
+significant reduction in the number of edges and messages. In a 1000-node
+topology, D-Cliques requires 98\% fewer edges and 96\% fewer total messages,
+with further possible gains using a small-world topology across cliques.
+\end{abstract}
+]
+
+% this must go after the closing bracket ] following \twocolumn[ ...
+
+% This command actually creates the footnote in the first column
+% listing the affiliations and the copyright notice.
+% The command takes one argument, which is text to display at the start of the footnote.
+% The \mlsysEqualContribution command is standard text for equal contribution.
+% Remove it (just {}) if you do not need this facility.
+
+\printAffiliationsAndNotice{}  % leave blank if no need to mention equal contribution
+%\printAffiliationsAndNotice{\mlsysEqualContribution} % otherwise use the standard text.
+
+\section{Introduction}
+
+Machine learning is currently shifting from a \emph{centralized}
+paradigm, in which models are trained on data located on a single machine or
+in a data center, to \emph{decentralized} ones.
+Indeed, the latter paradigm closely matches the natural data distribution
+in the numerous use-cases where data is collected and processed by several
+independent
+parties (hospitals, companies, personal devices, etc.).
+Federated Learning (FL) allows a set
+of participants to collaboratively train machine learning models
+on their joint
+data while keeping it where it has been produced. Not only does this avoid
+the costs of moving data, but it also  mitigates privacy and confidentiality concerns~\cite{kairouz2019advances}.
+Yet, working with natural data distributions introduces new challenges for
+learning systems, as
+local datasets
+reflect the usage and production patterns specific to each participant: they are
+\emph{not} independent and identically distributed
+(non-IID). More specifically, the relative frequency of different classes of examples may significantly vary
+across local datasets \cite{kairouz2019advances,quagmire}.
+Therefore, one of the key challenges in FL is to design algorithms that
+can efficiently deal with such non-IID data distributions
+\cite{kairouz2019advances,fedprox,scaffold,quagmire}.
+
+Federated learning algorithms can be classified into two categories depending
+on the underlying network topology they run on. In server-based FL, the
+network is organized according to a star topology: a central server orchestrates the training process by
+iteratively aggregating model updates received from the participants
+(\emph{clients}) and sending back the aggregated model \cite{mcmahan2016communication}. In contrast,
+fully decentralized FL algorithms operate over an arbitrary network topology
+where participants communicate only with their direct neighbors
+in the network. A classic example of such algorithms is Decentralized
+SGD (D-SGD) \cite{lian2017d-psgd}, in which participants alternate between
+local SGD updates and model averaging with neighboring nodes.
+
+In this paper, we focus on fully decentralized algorithms as they can
+generally scale better to the large number of participants seen in ``cross-device''
+applications \cite{kairouz2019advances}. Indeed, while a central
+server may quickly become a bottleneck as the number of participants increases, the topology used in fully decentralized algorithms can remain sparse
+enough that all participants need only communicate with a small number of other participants, i.e., nodes have small (constant or logarithmic) degree
+\cite{lian2017d-psgd}. For IID data, recent work has shown both empirically 
+\cite{lian2017d-psgd,Lian2018} and theoretically \cite{neglia2020} that sparse
+topologies like rings or grids do not significantly affect the convergence
+speed compared to using denser topologies.
+
+In contrast to the IID case however, our experiments demonstrate that \emph{the impact of topology is extremely significant for non-IID data}. This phenomenon is illustrated
+in Figure~\ref{fig:iid-vs-non-iid-problem}: We observe that  a ring or
+a grid topology clearly jeopardizes the convergence speed as local
+distributions do not have relative class frequencies similar to those of the global
+distribution, i.e. they exhibit \textit{local class bias}. We stress the fact
+that, unlike in centralized FL
+\cite{kairouz2019advances,scaffold,quagmire}, this
+happens even when nodes perform a single local update before averaging the
+model with their neighbors. In this paper, we address the following question:
+
+\textit{Can we design sparse topologies with  convergence
+  speed similar to the one obtained in a  fully connected network under
+  a large number of participants with local class bias?}
+
+\begin{figure*}[t]
+     \centering
+     
+     % From directory results/mnist
+     % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py ring/iid/all/2021-03-30-16:07:06-CEST ring/non-iid/all/2021-03-30-16:07:03-CEST --add-min-max --legend 'lower right' --yaxis test-accuracy --labels '100 nodes IID' '100 nodes non-IID' --save-figure ../../figures/ring-IID-vs-non-IID.png --font-size 20 --linestyles 'solid' 'dashed'
+     \begin{subfigure}[b]{0.25\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/ring-IID-vs-non-IID}
+\caption{\label{fig:ring-IID-vs-non-IID} Ring}
+     \end{subfigure}
+     \quad
+    % From directory results/mnist
+     % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py grid/iid/all/2021-03-30-16:07:01-CEST grid/non-iid/all/2021-03-30-16:06:59-CEST --add-min-max --legend 'lower right' --yaxis test-accuracy --labels '100 nodes IID' '100 nodes non-IID' --save-figure ../../figures/grid-IID-vs-non-IID.png --font-size 20 --linestyles 'solid' 'dashed'
+     \begin{subfigure}[b]{0.25\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/grid-IID-vs-non-IID}
+\caption{\label{fig:grid-IID-vs-non-IID} Grid}
+     \end{subfigure}
+     \quad
+         % From directory results/mnist
+     % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py fully-connected/iid/all/2021-03-30-16:07:20-CEST fully-connected/all/2021-03-10-09:25:19-CET  --add-min-max --legend 'lower right' --yaxis test-accuracy --labels '100 nodes IID' '100 nodes non-IID' --save-figure ../../figures/fully-connected-IID-vs-non-IID.png --font-size 20 --linestyles 'solid' 'dashed'
+     \begin{subfigure}[b]{0.25\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/fully-connected-IID-vs-non-IID}
+\caption{\label{fig:fully-connected-IID-vs-non-IID} Fully-connected}
+     \end{subfigure}
+        \caption{IID vs non-IID convergence speed of decentralized SGD for
+        logistic regression on
+        MNIST for different topologies. Bold lines show the average test
+        accuracy across nodes
+        while thin lines show the minimum
+        and maximum accuracy of individual nodes. While the effect of topology
+        is negligible for IID data, it is very significant in the
+        non-IID case. When fully-connected, both cases converge similarly. See
+        Section~\ref{section:experimental-settings} for details on
+        the experimental setup.}
+        \label{fig:iid-vs-non-iid-problem}
+\end{figure*}
+
+Specifically, we make the following contributions:
+(1) We propose D-Cliques, a sparse topology in which nodes are organized in
+interconnected cliques, i.e. locally fully-connected sets of nodes, such that
+the joint data distribution of each clique is representative of the global 
+(IID) distribution; (2) We propose Clique Averaging, a  modified version of 
+the standard D-SGD algorithm which decouples gradient averaging, used for
+optimizing local models, from distributed averaging, used to ensure all models
+converge, therefore reducing the bias introduced by inter-clique connections; 
+(3) We show how Clique Averaging can be used to implement unbiased momentum
+that would otherwise be detrimental in the non-IID setting; (4) We 
+demonstrate
+through an extensive experimental study that our approach  removes the effect
+of the local class bias on the MNIST~\cite{mnistWebsite} and
+CIFAR10~\cite{krizhevsky2009learning} datasets, for training a linear model and a deep
+convolutional network;  (5) Finally, we demonstrate the scalability of our
+approach by considering  up to 1000-node networks, in contrast to most
+previous work on fully decentralized learning that considers only a few tens
+of nodes
+\cite{tang18a,neglia2020,momentum_noniid,cross_gradient,consensus_distance}.
+
+For instance, our results show that using D-Cliques in a 1000-node network
+requires 98\% fewer edges ($18.9$ vs $999$ edges per participant on average),
+thereby yielding a 96\% reduction in the total number of required messages 
+(37.8 messages per round per node on average instead of 999), to obtain a convergence speed similar to that of a fully-connected topology. Furthermore, an additional 22\% improvement
+% (14.5 edges per node on average instead of 18.9)
+is possible when using a small-world inter-clique topology, with further potential gains at larger scales because of its quasilinear scaling ($O(n \log(n))$) in $n$, the number of nodes.
+
+The rest of this paper is organized as follows. We first present the problem
+statement and our methodology (Section~\ref{section:problem}). The D-Cliques
+design is presented in Section~\ref{section:d-cliques} along with an
+empirical illustration of its benefits. In
+Section~\ref{section:clique-averaging-momentum}, we
+show how to further reduce bias with Clique Averaging and how to use it to
+implement momentum.  We present the results of our extensive experimental
+study in  Section~\ref{section:non-clustered}. We review some related work in
+ Section~\ref{section:related-work}, and conclude with promising directions
+ for future work in Section~\ref{section:conclusion}.
+ 
+ \section{Problem Statement}
+
+\label{section:problem}
+
+We consider a set $N = \{1, \dots, n \}$ of $n$ nodes seeking to
+collaboratively solve a classification task with $c$ classes. Each node has access to a local dataset that
+ follows its own local distribution $D_i$. The goal is to find a global model
+ $x$ that performs well on the union of the local distributions by minimizing
+ the average training loss:
+\begin{equation}
+\min_{x} \frac{1}{n}\sum_{i=1}^{n} \mathds{E}_
+{s_i \sim D_i} [F_i(x;s_i)],
+\label{eq:dist-optimization-problem}
+\end{equation}
+where $s_i$ is a data example drawn from $D_i$ and $F_i$ is the loss function
+on node $i$. Therefore, $\mathds{E}_{s_i \sim D_i} [F_i(x;s_i)]$ denotes the
+expected loss of model $x$ on a random example $s_i$ drawn from $D_i$.
+
+To collaboratively solve Problem \eqref{eq:dist-optimization-problem}, each
+node can exchange messages with its neighbors in an undirected network graph
+$G(N,E)$ where $\{i,j\}\in E$ denotes an edge (communication channel)
+between nodes $i$ and $j$.
+
+\subsection{Training Algorithm}
+
+In this work, we use the popular Decentralized Stochastic
+Gradient Descent algorithm, aka D-SGD~\cite{lian2017d-psgd}. As
+shown in Algorithm~\ref{Algorithm:D-PSGD},
+a single iteration of D-SGD at node $i$ consists of sampling a mini-batch
+from its local distribution
+$D_i$, updating its local model $x_i$ by taking a stochastic gradient descent 
+(SGD) step according to the mini-batch, and performing a weighted average of
+its local model with those of its
+neighbors.
+This weighted average is defined by a
+mixing matrix $W$, in which $W_{ij}$ corresponds to the weight of
+the outgoing connection from node $i$ to $j$ and $W_{ij} = 0$ for $
+\{i,j\}\notin
+E$. To ensure that the local models converge on average to a stationary
+point
+of Problem
+\eqref{eq:dist-optimization-problem}, $W$
+must be doubly
+stochastic ($\sum_{j \in N} W_{ij} = 1$ and $\sum_{j \in N} W_{ji} = 1$) and
+symmetric, i.e. $W_{ij} = W_{ji}$~\cite{lian2017d-psgd}.
+
+%\begin{algorithm}[t]
+%   \caption{D-SGD, Node $i$}
+%   \label{Algorithm:D-PSGD}
+%   \begin{algorithmic}[1]
+%        \State \textbf{Require:} initial model parameters $x_i^{(0)}$,
+%        learning rate $\gamma$, mixing weights $W$, mini-batch size $m$,
+%        number of steps $K$
+%        \For{$k = 1,\ldots, K$}
+%          \State $s_i^{(k)} \gets \text{mini-batch sample of size $m$ drawn
+%          from~} D_i$
+%          \State $x_i^{(k-\frac{1}{2})} \gets x_i^{(k-1)} - \gamma \nabla F(x_i^{(k-1)}; s_i^{(k)})$ 
+%          \State $x_i^{(k)} \gets \sum_{j \in N} W_{ji}^{(k)} x_j^{(k-\frac{1}{2})}$
+%        \EndFor
+%   \end{algorithmic}
+%\end{algorithm}
+
+\subsection{Methodology}
+
+\subsubsection{Non-IID assumptions.}
+\label{section:non-iid-assumptions}
+
+As demonstrated in Figure~\ref{fig:iid-vs-non-iid-problem}, lifting the
+assumption of IID data significantly challenges the learning algorithm. In
+this paper, we focus on an \textit{extreme case of local class bias}: we
+consider that each node only has examples from a single class.
+
+To isolate the effect of local class bias from other potentially compounding
+factors, we make the following simplifying assumptions: (1) All classes are
+equally represented in the global dataset; (2) All classes are represented on
+the same number of nodes; (3) All nodes have the same number of examples.
+
+We believe that these assumptions are reasonable in the context of our study
+because: (1)
+Global class
+imbalance equally
+affects the optimization process on a single node and is therefore not
+specific to the decentralized setting; (2) Our results do not exploit specific
+positions in the topology;  (3) Imbalanced dataset sizes across nodes can be
+addressed for instance by appropriately weighting the individual loss
+functions. Our results can be extended to support additional compounding factors in future work.
+
+\subsubsection{Experimental setup.}
+\label{section:experimental-settings}
+
+Our main goal is to provide a fair comparison of the convergence speed across
+different topologies and algorithmic variations, in order to
+show that our approach
+can remove much of the effect of local class bias.
+
+We experiment with two datasets: MNIST~\cite{mnistWebsite} and
+CIFAR10~\cite{krizhevsky2009learning}, which both have $c=10$ classes.
+For MNIST, we use 45k and 10k examples from the original 60k
+training set for training and validation respectively. The remaining 5k
+training examples were randomly removed to ensure all 10 classes are balanced
+while ensuring that the dataset is evenly divisible across 100 and 1000 nodes.
+We use all 10k examples of
+the test set to measure prediction accuracy. For CIFAR10, classes are evenly
+balanced: we use 45k/50k images of the original training set for training,
+5k/50k for validation, and all 10k examples of the test set for measuring
+prediction accuracy.
+
+We
+use a logistic regression classifier for MNIST, which
+provides up to 92.5\% accuracy in the centralized setting.
+For CIFAR10, we use a Group-Normalized variant of LeNet~\cite{quagmire}, a
+deep convolutional network which achieves an accuracy of $72.3\%$ in the
+centralized setting.
+These models are thus reasonably accurate (which is sufficient to
+study the effect of the topology) while being sufficiently fast to train in a
+fully decentralized setting and simple enough to configure and analyze.
+Regarding hyper-parameters, we jointly optimize the learning rate and
+mini-batch size on the
+validation set for 100 nodes, obtaining respectively $0.1$ and $128$ for
+MNIST and $0.002$ and $20$ for CIFAR10.
+For CIFAR10, we additionally use a momentum of $0.9$.
+
+We evaluate 100- and 1000-node networks by creating multiple models in memory and simulating the exchange of messages between nodes.
+To ignore the impact of distributed execution strategies and system
+optimization techniques, we report the test accuracy of all nodes (min, max,
+average) as a function of the number of times each example of the dataset has
+been sampled by a node, i.e. an \textit{epoch}. This is equivalent to the classic case of a single node sampling the full distribution.
+To further make results comparable across different numbers of nodes, we lower
+the batch size proportionally to the number of nodes added, and inversely,
+e.g. on MNIST, 128 with 100 nodes vs. 13 with 1000 nodes. This
+ensures the same number of model updates and averaging per epoch, which is
+important to have a fair comparison.\footnote{Updating and averaging models
+after every example can eliminate the impact of local class bias. However, the
+resulting communication overhead is impractical.}
+
+Finally, we compare our results against an ideal baseline: either a
+fully-connected network topology with the same number of nodes or a single IID
+node. In both cases, the topology has no effect on
+the optimization. For a certain choice of number of nodes and
+mini-batch size, both approaches are equivalent. 
+
+\section{D-Cliques: Creating Locally Representative Cliques}
+\label{section:d-cliques}
+
+In this section, we present the design of D-Cliques. To give an intuition of our approach, let us consider the neighborhood of a single node in a grid similar to that of Figure~\ref{fig:grid-IID-vs-non-IID}, represented in Figure~\ref{fig:grid-iid-vs-non-iid-neighbourhood}.
+The colors of a node represent the different classes present in its local
+dataset. In the IID setting (Figure~\ref{fig:grid-iid-neighbourhood}), each
+node has examples of all classes in equal proportions. In the non-IID setting 
+(Figure~\ref{fig:grid-non-iid-neighbourhood}), each node has examples of only
+a
+single class and nodes are distributed randomly in the grid.
+
+A single training step, from the point of view of the center node, is equivalent to sampling a mini-batch five times larger from the union of the local distributions of all illustrated nodes.
+In the IID case, since gradients are computed from examples of all classes,
+the resulting averaged gradient  points in a direction that tends to reduce
+the loss across all classes. In contrast, in the non-IID case, only a subset
+of classes are
+represented in the immediate neighborhood of the node, thus the gradients will
+be biased towards these classes.
+Importantly, as the distributed averaging algorithm takes several steps to
+converge, this variance persists across iterations as the locally computed
+gradients are far from the global average.\footnote{It is possible, but
+very costly, to mitigate this by performing a sufficiently large number of
+averaging steps between each gradient step.} This can significantly slow down
+convergence speed to the point of making decentralized optimization
+impractical.
+
+\begin{figure}[t]
+     \centering
+     \begin{subfigure}[b]{0.18\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/grid-iid-neighbourhood}
+\caption{\label{fig:grid-iid-neighbourhood} IID}
+     \end{subfigure}
+     \begin{subfigure}[b]{0.18\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/grid-non-iid-neighbourhood}
+\caption{\label{fig:grid-non-iid-neighbourhood}  Non-IID}
+     \end{subfigure}
+        \caption{Neighborhood in an IID and non-IID grid.}
+        \label{fig:grid-iid-vs-non-iid-neighbourhood}
+\end{figure}
+
+In D-Cliques, we address the issues of non-iidness by carefully designing a
+network topology composed of \textit{cliques} and \textit{inter-clique
+connections}:
+\begin{itemize}
+ \item  D-Cliques recover a balanced representation of classes, similar to
+ that of the IID case, by constructing a topology such that each node is
+ part of a \textit{clique} with neighbors representing all classes.
+ \item To ensure a global consensus and convergence, 
+ \textit{inter-clique connections}
+ are introduced by connecting a small number of node pairs that are
+ part of  different cliques.
+\end{itemize}
+In the following, we introduce up to one inter-clique connection per node such that each clique has exactly one
+edge with every other clique; see Figure~\ref{fig:d-cliques-figure} for the
+corresponding D-Cliques network in the case of $n=100$ nodes and $c=10$
+classes. We will explore sparser inter-clique topologies in Section~\ref{section:interclique-topologies}.
+
+The mixing matrix $W$ required by D-SGD is obtained from standard
+Metropolis-Hastings weights~\cite{xiao2004fast} computed from the above
+topology, namely:
+\begin{equation}
+  W_{ij} = \begin{cases}
+    \frac{1}{\max(\text{degree}(i), \text{degree}(j)) + 1} & \text{if}~i \neq
+    j \text{ and } \{i,j\}\in E,\\
+   1 - \sum_{k \neq i} W_{ik} & \text{if}~i = j, \\
+   0 & \text{otherwise}.
+  \end{cases}
+  \label{eq:metro}
+\end{equation}
+
+
+We refer to Algorithm~\ref{Algorithm:D-Clique-Construction} in the appendix
+for a formal account of D-Cliques construction. We note that it only requires
+the knowledge of the local class distribution at each node. For the sake of
+simplicity, we assume that D-Cliques is constructed from the global
+knowledge of these distributions, which can easily be obtained by
+decentralized averaging in a pre-processing step. 
+
+The key idea of D-Cliques is that because the clique-level distribution $D_{
+\textit{clique}} = \sum_{i
+\in \textit{clique}} D_i$ is representative of the global distribution,
+the local models of nodes across cliques remain rather close. Therefore, a
+sparse inter-clique topology can be used, significantly reducing the total
+number of edges without slowing down the convergence. Furthermore, the degree
+of each node in the network remains low and even, making the D-Cliques
+topology very well-suited to decentralized federated learning. 
+
+\begin{figure}[t]
+    \centering 
+             
+    \begin{subfigure}[b]{0.20\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{../figures/fully-connected-cliques}
+    \caption{\label{fig:d-cliques-figure} D-Cliques (fully-connected
+    cliques)}
+    \end{subfigure}
+    \hfill
+    % To regenerate figure, from results/mnist
+    % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py fully-connected/all/2021-03-10-09:25:19-CET no-init-no-clique-avg/fully-connected-cliques/all/2021-03-12-11:12:49-CET --add-min-max --yaxis test-accuracy --ymin 80 --ymax 92.5 --labels '100 nodes non-IID fully-connected' '100 nodes non-IID d-cliques' --save-figure ../../figures/d-cliques-mnist-vs-fully-connected.png --legend 'lower right' --font-size 16 --linestyles 'solid' 'dashed'
+    \begin{subfigure}[b]{0.26\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-vs-fully-connected.png}
+    \caption{\label{fig:d-cliques-example-convergence-speed} Convergence Speed
+    on MNIST}
+    \end{subfigure}
+    
+\caption{\label{fig:d-cliques-example} D-Cliques topology and convergence
+speed on MNIST.}
+\end{figure}
+
+Figure~\ref{fig:d-cliques-example-convergence-speed} illustrates the
+performance of D-Cliques on MNIST with $n=100$ nodes. Observe that the
+convergence speed is
+very close
+to that of a fully-connected topology, and significantly better than with
+a ring or a grid (see Figure~\ref{fig:iid-vs-non-iid-problem}). With 
+100 nodes, it offers a reduction of $\approx90\%$ in the number of edges
+compared to a fully-connected topology. Nonetheless, there is still
+significant variance in the accuracy across nodes, which is due to the bias
+introduced by inter-clique edges. We address this issue in the next section.
+
+\section{Optimizing with Clique Averaging and Momentum}
+\label{section:clique-averaging-momentum}
+
+In this section, we present Clique Averaging. This feature, when added to D-SGD,
+removes the bias caused by the inter-clique edges of
+D-Cliques. We also show how it can be used to successfully implement momentum
+for non-IID data.
+
+\subsection{Clique Averaging: Debiasing Gradients from Inter-Clique Edges}
+\label{section:clique-averaging}
+
+While limiting the number of inter-clique connections reduces the
+number of messages traveling on the network, it also introduces its own
+bias.
+Figure~\ref{fig:connected-cliques-bias} illustrates the problem on the
+simple case of two cliques connected by one inter-clique edge (here,
+between the green node of the left clique and the pink node of the right
+clique). Let us focus on node A. With weights computed as in \eqref{eq:metro},
+node A's self-weight is $\frac{12}
+{110}$, the weight between A and the green node connected to B is
+$\frac{10}{110}$, and
+all other neighbors of A have a weight of $\frac{11}{110}$. Therefore, the
+gradient at A is biased towards its own class (pink) and against the green
+class. A similar bias holds for all other nodes
+without inter-clique edges with respect to their respective classes. For node
+B, all its edge weights (including its self-weight) are equal to $\frac{1}
+{11}$. However, the green class is represented twice (once as a clique
+neighbor and once from the inter-clique edge), while all other classes are
+represented only once. This biases the gradient toward the green class. The
+combined effect of these two sources of bias is to increase the variance
+of the local models across nodes.
+
+\begin{figure}[t]
+         \centering
+         \includegraphics[width=0.3\textwidth]{../figures/connected-cliques-bias}
+\caption{\label{fig:connected-cliques-bias} Illustrating the bias induced by
+inter-clique connections (see main text).}
+\end{figure}
+
+We address this problem by adding \emph{Clique Averaging} to D-SGD
+(Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD}), which essentially
+decouples gradient averaging from model averaging. The idea is to use only the
+gradients of
+neighbors within the same clique to compute the average gradient,
+providing an equal representation to all classes. In contrast, all neighbors'
+models, including those across inter-clique edges, participate in the model
+averaging step as in the original version.
+
+%\begin{algorithm}[t]
+%   \caption{D-SGD with Clique Averaging, Node $i$}
+%   \label{Algorithm:Clique-Unbiased-D-PSGD}
+%   \begin{algorithmic}[1]
+%        \State \textbf{Require} initial model parameters $x_i^{(0)}$, learning
+%        rate $\gamma$, mixing weights $W$, mini-batch size $m$, number of
+%        steps $K$
+%        \For{$k = 1,\ldots, K$}
+%          \State $s_i^{(k)} \gets \text{mini-batch sample of size $m$ drawn
+%          from~} D_i$
+%          \State $g_i^{(k)} \gets \frac{1}{|\textit{Clique}(i)|}\sum_{j \in \textit{Clique(i)}}  \nabla F(x_j^{(k-1)}; s_j^{(k)})$
+%          \State $x_i^{(k-\frac{1}{2})} \gets x_i^{(k-1)} - \gamma g_i^{(k)}$ 
+%          \State $x_i^{(k)} \gets \sum_{j \in N} W_{ji}^{(k)} x_j^{(k-\frac{1}{2})}$
+%        \EndFor
+%   \end{algorithmic}
+%\end{algorithm}
+
+% To regenerate figure, from results/mnist:
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py fully-connected/all/2021-03-10-09:25:19-CET no-init-no-clique-avg/fully-connected-cliques/all/2021-03-12-11:12:49-CET  no-init/fully-connected-cliques/all/2021-03-12-11:12:01-CET --add-min-max --yaxis test-accuracy --labels '100 nodes non-IID fully-connected' '100 nodes non-IID d-cliques w/o clique avg.' '100 nodes d-cliques non-IID w/ clique avg.' --legend 'lower right' --ymin 89 --ymax 92.5 --font-size 13 --save-figure ../../figures/d-clique-mnist-clique-avg.png --linestyles 'solid' 'dashed' 'dotted'
+\begin{figure}[t]
+         \centering
+         \includegraphics[width=0.35\textwidth]{../figures/d-clique-mnist-clique-avg}
+\caption{\label{fig:d-clique-mnist-clique-avg} Effect of Clique Averaging on MNIST. Y-axis starts at 89.}
+\end{figure}
+
+As illustrated in Figure~\ref{fig:d-clique-mnist-clique-avg}, this
+significantly reduces the variance of models across nodes and accelerates
+convergence to reach the same level as the one obtained with a
+fully-connected topology. Note that Clique Averaging induces a small
+additional cost, as gradients
+and models need to be sent in two separate rounds of messages. Nonetheless, compared to fully connecting all nodes, the total number of messages is reduced by $\approx 80\%$.
+
+\subsection{Implementing Momentum with Clique Averaging}
+\label{section:momentum}
+
+Efficiently training high capacity models usually requires additional
+optimization techniques. In particular, momentum~\cite{pmlr-v28-sutskever13}
+increases the magnitude of the components of the gradient that are shared
+between several consecutive steps, and is critical for deep convolutional networks like
+LeNet~\cite{lecun1998gradient,quagmire} to converge quickly. However, a direct
+application of momentum in a non-IID setting can actually be very detrimental.
+As illustrated in Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-effect}
+for the case of LeNet on CIFAR10 with 100 nodes, D-Cliques with momentum
+even fails to converge. Not using momentum actually gives a faster
+convergence, but there is a significant gap compared to the case of a single
+IID node with momentum.
+
+\begin{figure}[t]
+    \centering 
+    % To regenerate figure, from results/cifar10
+    % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-13:52:58-CET  no-init-no-clique-avg/fully-connected-cliques/all/2021-03-13-18:34:35-CET no-init-no-clique-avg-no-momentum/fully-connected-cliques/all/2021-03-26-13:47:35-CET/ --legend 'upper right' --add-min-max --labels '1-node IID w/ momentum'  '100 nodes non-IID d-cliques w/ momentum' '100 nodes non-IID d-cliques w/o momentum'  --font-size 14 --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-momentum-non-iid-effect.png --ymax 100 --linestyles 'solid' 'dashed' 'dotted'         
+    \begin{subfigure}[b]{0.35\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-momentum-non-iid-effect}
+    \caption{\label{fig:d-cliques-cifar10-momentum-non-iid-effect} Without Clique Averaging }
+    \end{subfigure}
+    \hfill
+    % To regenerate figure, from results/cifar10
+    % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-13:52:58-CET no-init/fully-connected-cliques/all/2021-03-13-18:32:55-CET --legend 'upper right' --add-min-max --labels '1-node IID w/ momentum' '100 nodes non-IID d-clique w/ momentum' --font-size 14 --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-momentum-non-iid-clique-avg-effect.png --ymax 100 --linestyles 'solid' 'dashed' 'dotted' 
+    \begin{subfigure}[b]{0.35\textwidth}
+    \centering
+    \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-momentum-non-iid-clique-avg-effect}
+    \caption{\label{fig:d-cliques-cifar10-momentum-non-iid-clique-avg-effect} With Clique Averaging}
+    \end{subfigure}
+\caption{\label{fig:cifar10-momentum} Non-IID Effect of Momentum on CIFAR10 with LeNet}
+\end{figure}
+
+We show here that Clique Averaging (Section~\ref{section:clique-averaging})
+allows us to compute an unbiased momentum from the
+unbiased average gradient $g_i^{(k)}$ of Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD}:
+\begin{equation}
+v_i^{(k)} \leftarrow m v_i^{(k-1)} +  g_i^{(k)} 
+\end{equation}
+It then suffices to modify the original gradient step to use momentum:
+\begin{equation}
+x_i^{(k-\frac{1}{2})} \leftarrow x_i^{(k-1)} - \gamma v_i^{(k)} 
+\end{equation}
+
+As shown in
+Figure~\ref{fig:d-cliques-cifar10-momentum-non-iid-clique-avg-effect}, this
+simple modification restores the benefits of momentum and closes the gap
+with the centralized setting.
+
+\section{Comparative Evaluation and Extensions}
+\label{section:non-clustered}
+
+In this section, we first compare D-Cliques to alternative topologies to
+confirm the relevance of our main design choices. Then,
+we evaluate some extensions of D-Cliques to further reduce the number of
+inter-clique connections so as to gracefully scale with the number of
+nodes.
+
+\subsection{Comparing D-Cliques to Other Sparse Topologies}
+
+We demonstrate the advantages of D-Cliques over alternative sparse topologies
+that have a similar number of edges. First, we consider topologies in which
+the neighbors of each node are selected at random (hence without any clique
+structure).
+Specifically, for $n=100$ nodes, we
+construct a random topology such that each node has exactly 10 edges, which is
+similar to the average 9.9 edges of our D-Cliques topology 
+(Figure~\ref{fig:d-cliques-figure}). To better understand the role of
+the clique structure beyond merely ensuring class representativity among
+neighbors,
+we also compare to a random topology similar to the one described above except
+that edges are
+chosen such that each node has neighbors of all possible classes. Finally, we
+also implement an analog of Clique Averaging for these random topologies,
+where all nodes de-bias their gradient based on the class distribution of
+their neighbors. In the latter case, since nodes do not form a clique, each
+node obtains a different average gradient.
+
+The results for MNIST and CIFAR10 are shown in
+Figure~\ref{fig:d-cliques-comparison-to-non-clustered-topologies}. For MNIST,
+a purely random topology has higher variance and lower convergence speed than
+D-Cliques (with or without Clique Averaging), while a random topology with
+class representativity performs similarly to D-Cliques without Clique
+Averaging. However, and perhaps surprisingly, a random topology with unbiased
+gradient performs slightly worse than without it. In any case, D-Cliques with
+Clique Averaging outperforms all random topologies, showing that the clique
+structure has a small but noticeable effect on the average accuracy and
+significantly reduces the variance across nodes in this setup.
+
+\begin{figure}[t]
+     \centering     
+         \begin{subfigure}[b]{0.35\textwidth}
+% To regenerate the figure, from directory results/mnist
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py fully-connected-cliques/all/2021-03-10-10:19:44-CET no-init-no-clique-avg/fully-connected-cliques/all/2021-03-12-11:12:49-CET  random-10/all/2021-07-23-11:59:56-CEST  random-10-diverse/all/2021-03-17-20:28:35-CET --labels 'd-clique (fcc)' 'd-clique (fcc) no clique avg.' '10 random edges' '10 random edges (all classes represented)' --add-min-max --legend 'lower right' --ymin 80 --ymax 92.5 --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-linear-comparison-to-non-clustered-topologies.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted' 'dashdot'
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-linear-comparison-to-non-clustered-topologies}
+                  \caption{MNIST with Linear Model}
+         \end{subfigure}
+                 \hfill                      
+% To regenerate the figure, from directory results/cifar10
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py no-init/fully-connected-cliques/all/2021-03-13-18:32:55-CET no-init-no-clique-avg/fully-connected-cliques/all/2021-03-13-18:34:35-CET random-10/all/2021-07-23-14:33:48-CEST  random-10-diverse/all/2021-03-17-20:30:41-CET random-10-diverse-unbiased-gradient/all/2021-03-17-20:31:14-CET --labels 'd-clique (fcc) clique avg.' 'd-clique (fcc) no clique avg.' '10 random edges' '10 random edges (all classes repr.)' '10 random (all classes repr.) with unbiased grad.' --add-min-max --legend 'upper left' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-linear-comparison-to-non-clustered-topologies.png --ymax 119 --font-size 13  --linestyles 'solid' 'dashed' 'dotted' 'dashdot' 'solid' --markers '' '' '' '' 'o'
+        \begin{subfigure}[b]{0.35\textwidth}
+        \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-linear-comparison-to-non-clustered-topologies}
+         \caption{CIFAR10 with LeNet}
+     \end{subfigure} 
+ \caption{\label{fig:d-cliques-comparison-to-non-clustered-topologies} Comparison to Non-Clustered Topologies} 
+\end{figure}
+
+On the harder CIFAR10 dataset with a deep convolutional network, the
+differences are much more dramatic:
+D-Cliques with Clique Averaging and momentum turns out to be critical for fast
+convergence.
+Crucially, all random topologies fail to converge to a good solution. This
+confirms that our clique structure is important to reduce variance
+across nodes and improve the convergence. The difference with the previous
+experiment seems to be due to both the use of a higher capacity model and to
+the intrinsic characteristics of the datasets.
+
+While the previous experiments suggest that our clique structure is
+instrumental in obtaining good performance, one may wonder whether
+intra-clique full connectivity is actually necessary.
+Figure~\ref{fig:d-cliques-intra-connectivity} shows the convergence speed of
+a D-Cliques topology where cliques have been sparsified by randomly
+removing 1 or 5 undirected edges per clique (out of 45). Strikingly, both for MNIST and
+CIFAR10, removing just a single edge from the cliques has a
+significant effect on the
+convergence speed. On CIFAR10, it even entirely negates the
+benefits of D-Cliques.
+
+Overall, these results show that achieving fast convergence on non-IID
+data with sparse topologies requires a very careful design, as we have
+proposed with D-Cliques.
+
+\begin{figure*}[t]
+     \centering
+
+\begin{subfigure}[htbp]{0.4\textwidth}
+     \centering   
+% To regenerate the figure, from directory results/mnist
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py no-init/fully-connected-cliques/all/2021-03-12-11:12:01-CET rm-1-edge/all/2021-03-18-17:28:27-CET rm-1-edge-unbiased-grad/all/2021-03-18-17:28:47-CET --add-min-max --ymin 85 --ymax 92.5 --legend 'lower right' --yaxis test-accuracy --labels 'fcc, clique grad.' 'fcc -1 edge/clique, no clique grad.' 'fcc -1 edge/clique, clique grad.' --save-figure ../../figures/d-cliques-mnist-clique-clustering-fcc-minus-1-edge.png  --font-size 13  --linestyle 'solid' 'dashed' 'dotted' 
+         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-clique-clustering-fcc-minus-1-edge}     
+\caption{\label{fig:d-cliques-mnist-clique-clustering-minus-1-edge} MNIST (-1 edge/clique)}
+\end{subfigure}
+\hfill
+\begin{subfigure}[htbp]{0.4\textwidth}
+     \centering
+% To regenerate the figure, from directory results/cifar10
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py no-init/fully-connected-cliques/all/2021-03-13-18:32:55-CET rm-1-edge/all/2021-03-18-17:29:58-CET rm-1-edge-unbiased-grad/all/2021-03-18-17:30:17-CET --add-min-max --ymax 80 --legend 'upper left' --yaxis test-accuracy --labels 'fcc, clique grad.' 'fcc -1 edge/clique, no clique grad.' 'fcc -1 edge/clique, clique grad.' --save-figure ../../figures/d-cliques-cifar10-clique-clustering-fcc-minus-1-edge.png --font-size 13 --linestyle 'solid' 'dashed' 'dotted'
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-clique-clustering-fcc-minus-1-edge}
+\caption{\label{fig:d-cliques-cifar10-clique-clustering-minus-1-edge} CIFAR10 (-1 edge/clique)}
+\end{subfigure}
+
+%\begin{subfigure}[htbp]{0.35\textwidth}
+%     \centering  
+%% To regenerate the figure, from directory results/mnist
+%% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py no-init/fully-connected-cliques/all/2021-03-12-11:12:01-CET rm-5-edges/all/2021-03-18-17:29:10-CET rm-5-edges-unbiased-grad/all/2021-03-18-17:29:36-CET --add-min-max --ymin 85 --ymax 92.5 --legend 'lower right' --yaxis test-accuracy --labels 'fcc, clique grad.' 'fcc -5 edges/clique, no clique grad.' 'fcc -5 edges/clique, clique grad.' --save-figure ../../figures/d-cliques-mnist-clique-clustering-fcc-minus-5-edges.png  --font-size 13 --linestyle 'solid' 'dashed' 'dotted'   
+%         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-clique-clustering-fcc-minus-5-edges}     
+%\caption{\label{fig:d-cliques-mnist-clique-clustering-minus-5-edges} MNIST (-5 edges/clique)}
+%\end{subfigure}
+%\hfill
+%\begin{subfigure}[htbp]{0.35\textwidth}
+%     \centering
+%% To regenerate the figure, from directory results/cifar10
+%% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py no-init/fully-connected-cliques/all/2021-03-13-18:32:55-CET rm-5-edges/all/2021-03-18-17:30:38-CET rm-5-edges-unbiased-grad/all/2021-03-18-17:31:04-CET --add-min-max --ymax 80 --legend 'upper left' --yaxis test-accuracy --labels 'fcc, clique grad.' 'fcc -5 edges/clique, no clique grad.'  'fcc -5 edges/clique, clique grad.' --save-figure ../../figures/d-cliques-cifar10-clique-clustering-fcc-minus-5-edges.png --font-size 13 --linestyle 'solid' 'dashed' 'dotted'
+%         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-clique-clustering-fcc-minus-5-edges}
+%\caption{\label{fig:d-cliques-cifar10-clique-clustering-minus-5-edges} CIFAR10 (-5 edges/clique)}
+%\end{subfigure}
+
+\caption{\label{fig:d-cliques-intra-connectivity} Importance of Intra-Clique Full-Connectivity}
+\end{figure*}
+
+\subsection{Scaling up D-Cliques with Sparser Inter-Clique Topologies}
+\label{section:interclique-topologies}
+
+
+So far, we have used a fully-connected inter-clique topology for D-Cliques,
+which has the advantage of bounding the
+\textit{path length}\footnote{The \textit{path length} is the number of edges on the shortest path (i.e., the path with the fewest edges) between two nodes.} to $3$ between any pair of nodes. This choice requires $
+\frac{n}{c}(\frac{n}{c} - 1)$ inter-clique edges, which scales quadratically
+in the number of nodes $n$ for a given clique size $c$\footnote{We consider \textit{directed} edges in the analysis: the number of undirected edges is half and does not affect asymptotic behavior.}. This can become significant at larger scales when $n$ is
+large compared to $c$.
+
+In this last series of experiments, we evaluate the effect of choosing sparser
+inter-clique topologies on the convergence speed for a larger network of 1000
+nodes. We compare the scalability and convergence speed of several
+D-Cliques variants, which all use $O(nc)$ edges
+to create cliques as a starting point.
+
+We first measure the convergence speed of inter-clique topologies whose number of edges scales linearly with the number of nodes. Among those, the \textit{ring} has (almost) the fewest possible edges: it
+uses $\frac{2n}{c}$ inter-clique edges but its average path length between nodes 
+also scales linearly.
+We also consider another topology, which we call \textit{fractal}, that provides a
+logarithmic
+bound on the average path length. In this hierarchical scheme, 
+cliques are assembled in larger groups of $c$ cliques that are connected internally with one edge per
+pair of cliques, but with only one edge between pairs of larger groups. The
+topology is built recursively such that $c$ groups will themselves form a
+larger group at the next level up. This results in at most $c$ edges per node 
+if edges are evenly distributed: i.e., each group within the same level adds 
+at most $c-1$ edges to other groups, leaving one node per group with $c-1$ 
+edges that can receive an additional edge to connect with other groups at the next level.
+Since nodes have at most $c$ edges, $n$ nodes have at most $nc$ edges, therefore
+the number of edges in this fractal scheme indeed scales linearly in the number of nodes.
+
+Second, we look at another scheme 
+in which the number of edges scales in a near, but not quite, linear fashion.
+We propose to connect cliques according to a
+small-world-like topology~\cite{watts2000small} applied on top of a
+ring~\cite{stoica2003chord}. In this scheme, cliques are first arranged in a
+ring. Then each clique adds symmetric edges, both clockwise and
+counter-clockwise on the ring, with the $m$ closest cliques in sets of
+cliques that are exponentially bigger the further they are on the ring (see
+Algorithm~\ref{Algorithm:Smallworld} in the appendix for
+details on the construction). This ensures a good connectivity with other
+cliques that are close on the ring, while still keeping the average
+path length small. This scheme uses $2m\frac{n}{c}\log(\frac{n}{c})$ inter-clique edges and
+therefore grows in the order of $O(n\log(n))$ with the number of nodes.
+
+Figure~\ref{fig:d-cliques-cifar10-convolutional} shows the convergence
+speed of all the above schemes on MNIST and CIFAR10, compared to the ideal
+baseline
+of a
+single IID node performing the same number of updates per epoch (representing
+the fastest convergence speed achievable if topology had no impact). Among the linear schemes, the ring
+topology converges but is much slower than our fractal scheme. Among the super-linear schemes, the small-world
+topology has a convergence speed that is almost the same as with a
+fully-connected inter-clique topology but with 22\% fewer edges
+(14.5 edges on average instead of 18.9). 
+
+While the small-world inter-clique topology shows promising scaling behaviour, the
+fully-connected topology still offers
+significant benefits with 1000 nodes, as it represents a 98\% reduction in the
+number of edges compared to fully connecting individual nodes (18.9 edges on
+average instead of 999) and a 96\% reduction in the number of messages (37.8
+messages per round per node on average instead of 999). We refer to
+Appendix~\ref{app:scaling} for additional results comparing the convergence
+speed across different numbers of nodes. Overall, these results
+show that D-Cliques can nicely scale with the number of nodes.
+
+\begin{figure*}[t]
+     \centering
+       % To regenerate the figure, from directory results/mnist
+ % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-09:20:03-CET ../scaling/1000/mnist/fractal-cliques/all/2021-03-14-17:41:59-CET ../scaling/1000/mnist/clique-ring/all/2021-03-13-18:22:36-CET     --add-min-max --yaxis test-accuracy --legend 'lower right' --ymin 84 --ymax 92.5 --labels '1 node IID' 'd-cliques (fractal)' 'd-cliques (ring)'  --save-figure ../../figures/d-cliques-mnist-1000-nodes-comparison-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
+     \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+            \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-1000-nodes-comparison-linear}
+             \caption{\label{fig:d-cliques-mnist-1000-nodes-comparison-linear} MNIST with Linear Model: Linear Inter-clique Topologies.}
+     \end{subfigure}
+     \hfill
+     % To regenerate the figure, from directory results/cifar10
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-13:52:58-CET ../scaling/1000/cifar10/fractal-cliques/all/2021-03-14-17:42:46-CET ../scaling/1000/cifar10/clique-ring/all/2021-03-14-09:55:24-CET  --add-min-max --yaxis test-accuracy --labels '1-node IID' 'd-cliques (fractal)' 'd-cliques (ring)' --legend 'lower right' --save-figure ../../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
+     \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-linear}
+\caption{\label{fig:d-cliques-cifar10-1000-vs-1-node-test-accuracy-linear}  CIFAR10 with LeNet Model: Linear Inter-clique Topologies.}
+     \end{subfigure}
+    
+     
+ % To regenerate the figure, from directory results/mnist
+ % python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-09:20:03-CET ../scaling/1000/mnist/fully-connected-cliques/all/2021-03-14-17:56:26-CET ../scaling/1000/mnist/smallworld-logn-cliques/all/2021-03-23-21:45:39-CET --add-min-max --yaxis test-accuracy --legend 'lower right' --ymin 84 --ymax 92.5 --labels '1 node IID'  'd-cliques (fully-connected cliques)' 'd-cliques (smallworld)'  --save-figure ../../figures/d-cliques-mnist-1000-nodes-comparison-super-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
+     \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+            \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-1000-nodes-comparison-super-linear}
+             \caption{\label{fig:d-cliques-mnist-1000-nodes-comparison-super-linear} MNIST with Linear Model: Superlinear Inter-clique Topologies.}
+     \end{subfigure}
+     \hfill
+     % To regenerate the figure, from directory results/cifar10
+% python ../../../../Software/non-iid-topology-simulator/tools/plot_convergence.py 1-node-iid/all/2021-03-10-13:52:58-CET ../scaling/1000/cifar10/fully-connected-cliques/all/2021-03-14-17:41:20-CET ../scaling/1000/cifar10/smallworld-logn-cliques/all/2021-03-23-22:13:57-CET  --add-min-max --yaxis test-accuracy --labels '1-node IID' 'd-cliques (fully-connected cliques)' 'd-cliques (smallworld)' --legend 'lower right' --save-figure ../../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-super-linear.png --font-size 13 --linestyles 'solid' 'dashed' 'dotted'
+     \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-1000-vs-1-node-test-accuracy-super-linear}
+\caption{\label{fig:d-cliques-cifar10-1000-vs-1-node-test-accuracy-super-linear}  CIFAR10 with LeNet Model: Superlinear Inter-clique Topologies.}
+     \end{subfigure}
+     
+\caption{\label{fig:d-cliques-cifar10-convolutional} D-Cliques Convergence Speed with 1000 nodes, non-IID, Constant Updates per Epoch, with Different Inter-Clique Topologies.}
+\end{figure*}
+
+\section{Related Work}
+\label{section:related-work}
+
+In this section, we review some related work on dealing with non-IID data in
+federated learning, and on the role of topology in fully decentralized
+algorithms.
+
+\paragraph{Dealing with non-IID data in server-based FL.}
+Non-IID data is not much of an issue in server-based FL if
+clients send their parameters to the server after each gradient update.
+Problems arise when one seeks to reduce
+the number of communication rounds by allowing each participant to perform
+multiple local updates, as in the popular FedAvg algorithm 
+\cite{mcmahan2016communication}. Indeed, non-IID data can prevent
+such algorithms from
+converging to a good solution \cite{quagmire,scaffold}. This led to
+algorithms that are specifically designed to mitigate the impact
+of non-IID data while performing
+multiple local updates, using adaptive client sampling \cite{quagmire}, update
+corrections \cite{scaffold} or regularization in the local objective 
+\cite{fedprox}. Another direction is to embrace the non-IID scenario by
+learning personalized models for each client 
+\cite{smith2017federated,perso_fl_mean,maml,moreau}.
+We note that recent work explores rings of server-based topologies 
+\cite{tornado}, but the focus there is not on dealing with non-IID data but
+on making server-based FL more scalable to a large number of clients.
+
+\paragraph{Dealing with non-IID data in fully decentralized FL.}
+Non-IID data is known to negatively impact the convergence speed
+of fully decentralized FL algorithms in practice \cite{jelasity}. Aside from approaches that aim to learn personalized models \cite{Vanhaesebrouck2017a,Zantedeschi2020a}, this
+motivated the design of algorithms with modified updates based on variance
+reduction \cite{tang18a}, momentum correction \cite{momentum_noniid},
+cross-gradient
+aggregation \cite{cross_gradient}, or multiple averaging steps
+between updates (see \cite{consensus_distance} and references therein). These
+algorithms
+typically require significantly more communication and/or computation, and
+have only been evaluated on small-scale networks with a few tens of
+nodes.\footnote{We
+also observed that \cite{tang18a} is subject to numerical
+instabilities when run on topologies other than rings. When
+the rows and columns of $W$ do not exactly
+sum to $1$ (due to finite precision), these small differences get amplified by
+the proposed updates and make the algorithm diverge.}
+In contrast, D-Cliques focuses on the design of a sparse topology which is
+able to compensate for the effect of non-IID data and scales to large
+networks. We do not modify the simple
+and efficient D-SGD
+algorithm \cite{lian2017d-psgd} beyond removing some neighbor
+contributions
+that otherwise bias the gradient direction.
+
+\paragraph{Impact of topology in fully decentralized FL.} It is well
+known
+that the choice of network topology can affect the
+convergence of fully decentralized algorithms. In theoretical convergence
+rates, this is typically accounted
+for by a dependence on the spectral gap of
+the network, see for instance 
+\cite{Duchi2012a,Colin2016a,lian2017d-psgd,Nedic18}.
+However, for IID data, practice contradicts these classic
+results as fully decentralized algorithms have been observed to converge
+essentially as fast
+on sparse topologies like rings or grids as they do on a fully connected
+network \cite{lian2017d-psgd,Lian2018}. Recent work 
+\cite{neglia2020,consensus_distance} sheds light on this phenomenon with refined convergence analyses based on differences between gradients or parameters across nodes, which are typically
+smaller in the IID case. However, these results do not give any clear insight
+regarding the role of the topology in the non-IID case. We note that some work
+has gone into designing efficient topologies to optimize the use of
+network resources (see e.g., \cite{marfoq}), but the topology is chosen
+independently of how data is distributed across nodes. In summary, the role
+of topology in the non-IID data scenario is not well understood and we are not
+aware of prior work focusing on this question. Our work is the first
+to show that an
+appropriate choice of data-dependent topology can effectively compensate for
+non-IID data.
+
+\section{Conclusion}
+\label{section:conclusion}
+
+We proposed D-Cliques, a sparse topology that recovers the convergence
+speed of a fully-connected network in the presence of local class bias.
+D-Cliques is based on assembling subsets of nodes into cliques such
+that the clique-level class distribution is representative of the global
+distribution, thereby locally recovering IIDness. Cliques are joined in a
+sparse inter-clique topology so that
+they quickly converge to the same model. We proposed Clique
+Averaging to remove the non-IID bias in gradient computation by
+averaging gradients only with other nodes within the clique. Clique Averaging
+can in turn be used to implement unbiased momentum to recover the convergence
+speed usually only possible with IID mini-batches. Through our experiments, we
+showed that the clique structure of D-Cliques is critical in obtaining these
+results and that a small-world inter-clique topology with only $O(n \log (n))$ 
+edges achieves the best compromise between
+convergence speed and scalability with the number of nodes.
+
+D-Cliques thus appears to be very promising to reduce bandwidth
+usage on FL servers and to implement fully decentralized alternatives in a
+wider range of applications where global coordination is impossible or costly.
+For instance, the presence and relative frequency of classes in each node
+could be computed using PushSum~\cite{kempe2003gossip}, and the topology could
+be constructed in a decentralized and adaptive way with
+PeerSampling~\cite{jelasity2007gossip}. This will be investigated in future work.
+We also believe that our ideas can be useful to deal
+with more general types of data non-IIDness beyond the important case of
+local class bias that we studied in this paper. An important example is
+covariate shift or feature distribution skew \cite{kairouz2019advances}, for
+which local density estimates could be used as a basis to construct cliques that
+approximately recover the global distribution.
+
+
+\bibliography{../main.bib}
+\bibliographystyle{mlsys2022}
+
+
+
+\appendix
+\section{Detailed Algorithms}
+ 
+ We present a more detailed and precise explanation of the two main algorithms
+ of the paper: one for constructing D-Cliques
+ (Algorithm~\ref{Algorithm:D-Clique-Construction}) and one for establishing a small-world
+ inter-clique topology (Algorithm~\ref{Algorithm:Smallworld}).
+ 
+ \subsection{D-Cliques Construction}
+ 
+ Algorithm~\ref{Algorithm:D-Clique-Construction} shows the overall approach
+ for constructing a D-Cliques topology in the non-IID case.\footnote{An IID
+ version of D-Cliques, in which each node has an equal number of examples of
+ all classes, can be implemented by picking $\#L$ nodes per clique at random.}
+ It expects the following inputs: $L$, the set of all classes present in the global distribution $D = \bigcup_{i \in N} D_i$; $N$, the set of all nodes; a function $classes(S)$, which given a subset $S$ of nodes in $N$ returns the set of classes in their joint local distributions ($D_S = \bigcup_{i \in S} D_i$); a function $intraconnect(DC)$, which given $DC$, a set of cliques (set of sets of nodes), creates a set of edges ($\{\{i,j\}, \dots \}$) connecting all nodes within each clique to one another; a function $interconnect(DC)$, which given a set of cliques, creates a set of edges ($\{\{i,j\}, \dots \}$) connecting nodes belonging to different cliques; and a function $weights(E)$, which given a set of edges, returns the weighted matrix $W_{ij}$.  Algorithm~\ref{Algorithm:D-Clique-Construction} returns both $W_{ij}$, for use in D-SGD (Algorithms~\ref{Algorithm:D-PSGD} and~\ref{Algorithm:Clique-Unbiased-D-PSGD}), and $DC$, for use with Clique Averaging (Algorithm~\ref{Algorithm:Clique-Unbiased-D-PSGD}).
+ 
+%   \begin{algorithm}[h]
+%   \caption{D-Cliques Construction}
+%   \label{Algorithm:D-Clique-Construction}
+%   \begin{algorithmic}[1]
+%        \State \textbf{Require:} set of classes globally present $L$, 
+%        \State~~ set of all nodes $N = \{ 1, 2, \dots, n \}$,
+%        \State~~ fn $\textit{classes}(S)$ that returns the classes present in a subset of nodes $S$,
+%        \State~~ fn $\textit{intraconnect}(DC)$ that returns edges intraconnecting cliques of $DC$,
+%        \State~~ fn $\textit{interconnect}(DC)$ that returns edges interconnecting cliques of $DC$ (Sec.~\ref{section:interclique-topologies})
+%         \State~~ fn $\textit{weights}(E)$ that assigns weights to edges in $E$ 
+%         
+%        \State $R \leftarrow \{ n~\text{for}~n \in N \}$ \Comment{Remaining nodes}
+%        \State $DC \leftarrow \emptyset$ \Comment{D-Cliques}
+%        \State $\textit{C} \leftarrow \emptyset$ \Comment{Current Clique}
+%        \While{$R \neq \emptyset$}
+%		\State $n \leftarrow \text{pick}~1~\text{from}~\{ m \in R | \textit{classes}(\{m\}) \subsetneq \textit{classes}(\textit{C}) \}$
+%		\State $R \leftarrow R \setminus \{ n \}$
+%		\State $C \leftarrow C \cup \{ n \}$
+%		\If{$\textit{classes}(C) = L$}
+%		    \State $DC \leftarrow DC \cup \{ C \}$
+%		    \State $C \leftarrow \emptyset$
+%		\EndIf
+%        \EndWhile
+%        \State \Return $(weights(\textit{intraconnect}(DC) \cup \textit{interconnect}(DC)), DC)$
+%   \end{algorithmic}
+%\end{algorithm}
+ 
+The implementation builds a single clique by adding nodes with different
+classes until all classes of the global distribution are represented. Each
+clique is built sequentially until all nodes are part of a clique.
+Because all classes are represented on an equal number of nodes, all cliques
+will have nodes of all classes. Furthermore, since nodes have examples
+of a single class, we are guaranteed a valid assignment is possible in a greedy manner. After cliques are created, edges are added and weights are assigned to edges, using the corresponding input functions.
+
+\subsection{Small-world Inter-clique Topology}
+
+Algorithm~\ref{Algorithm:Smallworld} instantiates the function 
+\textit{interconnect} with a
+small-world inter-clique topology as described in Section~\ref{section:interclique-topologies}. It adds a
+linear number of inter-clique edges by first arranging cliques on a ring. It then adds a logarithmic number of ``finger'' edges to other cliques on the ring, chosen such that a constant number of edges is added per set, on sets that are exponentially bigger the further away they are on the ring. ``Finger'' edges are added symmetrically on both sides of the ring to the cliques in each set that are closest to the given clique.
+
+%\begin{algorithm}[h]
+%   \caption{$\textit{smallworld}(DC)$:  adds $O(\# N \log(\# N))$ edges}
+%   \label{Algorithm:Smallworld}
+%   \begin{algorithmic}[1]
+%        \State \textbf{Require:} set of cliques $DC$ (set of set of nodes)
+%        \State ~~size of neighborhood $ns$ (default 2)
+%        \State ~~function $\textit{least\_edges}(S, E)$ that returns one of the nodes in $S$ with the least number of edges in $E$
+%        \State $E \leftarrow \emptyset$ \Comment{Set of Edges}
+%        \State $L \leftarrow [ C~\text{for}~C \in DC ]$ \Comment{Arrange cliques in a list}
+%        \For{$i \in \{1,\dots,\#DC\}$} \Comment{For every clique}
+%          \State \Comment{For sets of cliques exponentially further away from $i$}
+%          \For{$\textit{offset} \in \{ 2^x~\text{for}~x~\in \{ 0, \dots,
+%          \lceil \log_2(\#DC) \rceil \} \}$} 
+%             \State \Comment{Pick the $ns$ closest}
+%             \For{$k \in \{0,\dots,ns-1\}$}
+%                 \State \Comment{Add inter-clique connections in both directions}
+%                 \State $n \leftarrow \textit{least\_edges}(L_i, E)$
+%                 \State $m \leftarrow \textit{least\_edges}(L_{(i+\textit{offset}+k) \% \#DC}, E)$ \Comment{clockwise in ring}
+%                 \State $E \leftarrow E \cup \{ \{n,m\} \}$
+%                 \State $n \leftarrow \textit{least\_edges}(L_i, E)$
+%                 \State $m \leftarrow \textit{least\_edges}(L_{(i-\textit{offset}-k)\% \#DC} , E)$ \Comment{counter-clockwise in ring}
+%                 \State $E \leftarrow E \cup \{ \{n,m\} \}$
+%             \EndFor
+%           \EndFor
+%        \EndFor
+%        \State \Return E
+%   \end{algorithmic}
+%\end{algorithm}
+
+Algorithm~\ref{Algorithm:Smallworld} expects a set of cliques $DC$, previously computed by
+Algorithm~\ref{Algorithm:D-Clique-Construction}; a size of neighborhood $ns$,
+which is the number of finger edges to add per set of cliques; and a function
+\textit{least\_edges}, which, given a set of nodes $S$ and an existing set of
+edges $E =  \{\{i,j\}, \dots \}$, returns one of the nodes in $S$ with the least number of edges in $E$. The algorithm returns a new set of edges $\{\{i,j\}, \dots \}$ containing all edges added by the small-world topology.
+
+The implementation first arranges the cliques of $DC$ in a list, which
+represents the ring. Traversing the list with increasing indices is equivalent
+to traversing the ring in the clockwise direction, and conversely for decreasing indices. Then, for every clique $i$ on the ring from which we are computing the distance to others, a number of edges are added. All other cliques are implicitly arranged in mutually exclusive sets, whose size and offset grow exponentially (doubling at every step). Then, for each of these sets, $ns$ edges are added in both the clockwise and counter-clockwise directions, always on the nodes with the least number of edges in each clique. The ring edges are implicitly added to the cliques at offset $1$ in both directions.
+ 
+ % \section{Additional Experiments}
+
+% \subsection{Effect of Clique Averaging and Uniform Initialization}
+
+% Section~\ref{section:clique-averaging} explained how Clique Averaging reduces bias and showed that Clique Averaging was significantly beneficial on MNIST with fully-connected D-Cliques. In this section, we provide additional results for the ring topology, as well as for CIFAR10. In addition, during our early exploration, we noticed that ensuring \textit{uniform initialization}, i.e. ensuring that all nodes start with the same model, increased convergence speed when connecting two cliques with 1-2 interclique edges. We therefore also verify whether this effect is still significant with 10 cliques (100 nodes), on a ring and with full connections between cliques, as well as on MNIST and CIFAR10. We also verified what interaction this had with Clique Averaging.
+
+% Figure~\ref{fig:d-cliques-mnist-init-clique-avg-effect} shows all the results for MNIST. Comparing Figure~\ref{fig:d-cliques-mnist-init-clique-avg-effect-ring-test-accuracy} to~\ref{fig:d-cliques-mnist-no-init-clique-avg-effect-ring-test-accuracy}, and Figure~\ref{fig:d-cliques-mnist-init-clique-avg-effect-fcc-test-accuracy} to~\ref{fig:d-cliques-mnist-no-init-clique-avg-effect-fcc-test-accuracy} together, we see that Uniform Initialization has imperceptible effects. However, for all four sub-figures, using Clique Averaging has a slightly better average convergence speed, and significantly lower variance between nodes, than not using it. Moreover, the improvement is larger with Fully-Connected D-Cliques.
+
+% \begin{figure}[htbp]
+%      \centering
+%       % To regenerate the figure, from directory results/mnist   
+%       % python ../../../learn-topology/tools/plot_convergence.py clique-ring/all/2021-03-10-18:14:35-CET no-clique-avg/clique-ring/all/2021-03-12-10:40:37-CET --add-min-max --yaxis test-accuracy --labels 'with clique avg.' 'without clique avg.' --legend 'lower right' --ymin 85 --ymax 92.5 --save-figure ../../figures/d-cliques-mnist-init-clique-avg-effect-ring-test-accuracy.png
+%       \begin{subfigure}[b]{0.48\textwidth}
+%          \centering
+%          \includegraphics[width=\textwidth]{figures/d-cliques-mnist-init-clique-avg-effect-ring-test-accuracy}
+%          \caption{\label{fig:d-cliques-mnist-init-clique-avg-effect-ring-test-accuracy} D-Cliques (Ring), with Uniform Initialization}
+%      \end{subfigure}
+%      \quad 
+%      % To regenerate the figure, from directory results/mnist
+%      % python ../../../learn-topology/tools/plot_convergence.py no-init/clique-ring/all/2021-03-12-10:40:11-CET no-init-no-clique-avg/clique-ring/all/2021-03-12-10:41:03-CET --add-min-max --yaxis test-accuracy --labels 'with clique avg.' 'without clique avg.' --legend 'lower right' --ymin 85 --ymax 92.5 --save-figure ../../figures/d-cliques-mnist-no-init-clique-avg-effect-ring-test-accuracy.png   
+%       \begin{subfigure}[b]{0.48\textwidth}
+%          \centering
+%          \includegraphics[width=\textwidth]{figures/d-cliques-mnist-no-init-clique-avg-effect-ring-test-accuracy}
+%          \caption{\label{fig:d-cliques-mnist-no-init-clique-avg-effect-ring-test-accuracy} D-Cliques (Ring), without Uniform Initialization}
+%      \end{subfigure}
+     
+%      % To regenerate the figure, from directory results/mnist
+%      %python ../../../learn-topology/tools/plot_convergence.py fully-connected-cliques/all/2021-03-10-10:19:44-CET no-clique-avg/fully-connected-cliques/all/2021-03-12-11:12:26-CET --add-min-max --yaxis test-accuracy --labels 'with clique avg.'    'without clique avg.'  --legend 'lower right' --ymin 85 --ymax 92.5 --save-figure ../../figures/d-cliques-mnist-init-clique-avg-effect-fcc-test-accuracy.png
+%        \begin{subfigure}[b]{0.48\textwidth}
+%          \centering
+%          \includegraphics[width=\textwidth]{figures/d-cliques-mnist-init-clique-avg-effect-fcc-test-accuracy}
+%          \caption{\label{fig:d-cliques-mnist-init-clique-avg-effect-fcc-test-accuracy} D-Cliques (Fully-Connected), with Uniform Initialization}
+%      \end{subfigure}
+%      \quad
+%       % To regenerate the figure, from directory results/mnist
+%      %python ../../../learn-topology/tools/plot_convergence.py no-init/fully-connected-cliques/all/2021-03-12-11:12:01-CET no-init-no-clique-avg/fully-connected-cliques/all/2021-03-12-11:12:49-CET --add-min-max --yaxis test-accuracy --labels 'with clique avg.' 'without clique avg.' --legend 'lower right' --ymin 85 --ymax 92.5 --save-figure ../../figures/d-cliques-mnist-no-init-clique-avg-effect-fcc-test-accuracy.png
+%        \begin{subfigure}[b]{0.48\textwidth}
+%          \centering
+%          \includegraphics[width=\textwidth]{figures/d-cliques-mnist-no-init-clique-avg-effect-fcc-test-accuracy}
+%          \caption{\label{fig:d-cliques-mnist-no-init-clique-avg-effect-fcc-test-accuracy} D-Cliques (Fully-Connected), without Uniform Initialization}
+%      \end{subfigure}
+     
+     
+% \caption{\label{fig:d-cliques-mnist-init-clique-avg-effect} MNIST: Effects of Clique Averaging and Uniform Initialization on Convergence Speed. (100 nodes, non-IID, D-Cliques, bsz=128)}
+% \end{figure}
+
+% Figure~\ref{fig:d-cliques-cifar10-init-clique-avg-effect} shows all the results for CIFAR10. On the one hand, with D-Cliques arranged in a ring, uniform initialization has a small but positive effect on convergence speed, whether Clique Averaging is used or not. With fully-connected D-Cliques,  the effect is significantly smaller and almost negligible, both with and without Clique Averaging. On the other hand, Clique Averaging is always beneficial, by a significantly larger margin for both interclique topologies and with and without uniform initialization. Moreover, the effect is stronger than for MNIST.
+
+%     \begin{figure}[htbp]
+%      \centering
+%      % To regenerate the figure, from directory results/cifar10
+%      % python ../../../learn-topology/tools/plot_convergence.py clique-ring/all/2021-03-10-11:58:43-CET no-init/clique-ring/all/2021-03-13-18:28:30-CET no-clique-avg/clique-ring/all/2021-03-13-18:27:09-CET  no-init-no-clique-avg/clique-ring/all/2021-03-13-18:29:58-CET --add-min-max --yaxis test-accuracy --labels 'with clique avg., with uniform init.' 'with clique avg., without uniform init.'  'without clique avg., with uniform init.'   'without clique avg., without uniform init.' --legend 'upper left' --ymax 115  --save-figure ../../figures/d-cliques-cifar10-init-clique-avg-effect-ring-test-accuracy.png --font-size 15
+%       \begin{subfigure}[b]{0.48\textwidth}
+%          \centering
+%          \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-init-clique-avg-effect-ring-test-accuracy}
+%          \caption{\label{fig:d-cliques-cifar10-init-clique-avg-effect-ring-test-accuracy} D-Cliques (Ring)}
+%      \end{subfigure}
+%      % To regenerate the figure, from directory results/cifar10
+%      %python ../../../learn-topology/tools/plot_convergence.py fully-connected-cliques/all/2021-03-10-13:58:57-CET no-init/fully-connected-cliques/all/2021-03-13-18:32:55-CET no-clique-avg/fully-connected-cliques/all/2021-03-13-18:31:36-CET  no-init-no-clique-avg/fully-connected-cliques/all/2021-03-13-18:34:35-CET --add-min-max --yaxis test-accuracy --labels 'with clique avg., with uniform init.' 'with clique avg., without uniform init.'  'without clique avg., with uniform init.'   'without clique avg., without uniform init.' --legend 'upper left'  --ymax 115 --save-figure ../../figures/d-cliques-cifar10-init-clique-avg-effect-fcc-test-accuracy.png --font-size 15
+%        \begin{subfigure}[b]{0.48\textwidth}
+%          \centering
+%          \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-init-clique-avg-effect-fcc-test-accuracy}
+%          \caption{\label{fig:d-cliques-cifar10-init-clique-avg-effect-fcc-test-accuracy} D-Cliques (Fully-Connected)}
+%      \end{subfigure}
+% \caption{\label{fig:d-cliques-cifar10-init-clique-avg-effect} CIFAR10: Effects of Clique Averaging and Uniform Initialization on Convergence Speed. (100 nodes, non-IID, D-Cliques, bsz=20, momentum=0.9)}
+% \end{figure}
+
+% We conclude that Uniform Initialization is not so important for convergence speed but that Clique Averaging is always significantly so.
+
+% \subsection{Comparison to Non-Clustered Topologies}    
+%     
+%     \begin{figure}
+%\centering
+%              \begin{subfigure}[htb]{0.48\textwidth}
+%% To regenerate the figure, from directory results/mnist/gn-lenet
+%% python ../../../../learn-topology/tools/plot_convergence.py no-init/all/2021-03-22-21:39:54-CET no-init-no-clique-avg/all/2021-03-22-21:40:16-CET random-10/all/2021-03-22-21:41:06-CET random-10-diverse/all/2021-03-22-21:41:46-CET random-10-diverse-unbiased-grad/all/2021-03-22-21:42:04-CET --legend 'lower right' --add-min-max --labels 'd-clique (fcc) clique avg.' 'd-clique (fcc) no clique avg.' '10 random edges' '10 random edges (all classes repr.)' '10 random edges (all classes repr.) with unbiased grad.' --ymin 80 --yaxis test-accuracy --save-figure ../../../figures/d-cliques-mnist-lenet-comparison-to-non-clustered-topologies.png
+%         \includegraphics[width=\textwidth]{figures/d-cliques-mnist-lenet-comparison-to-non-clustered-topologies}
+%         \caption{\label{fig:d-cliques-mnist-lenet-comparison-to-non-clustered-topologies} LeNet Model}
+%        \end{subfigure}
+%        \hfill
+%                      \begin{subfigure}[htb]{0.48\textwidth}
+%% To regenerate the figure, from directory results/mnist/gn-lenet
+%% python ../../../../learn-topology/tools/plot_convergence.py no-init/all/2021-03-22-21:39:54-CET no-init-no-clique-avg/all/2021-03-22-21:40:16-CET random-10/all/2021-03-22-21:41:06-CET random-10-diverse/all/2021-03-22-21:41:46-CET random-10-diverse-unbiased-grad/all/2021-03-22-21:42:04-CET --legend 'upper right' --add-min-max --labels 'd-clique (fcc) clique avg.' 'd-clique (fcc) no clique avg.' '10 random edges' '10 random edges (all classes repr.)' '10 random edges (all classes repr.) with unbiased grad.' --ymax 0.7 --yaxis scattering --save-figure ../../../figures/d-cliques-mnist-lenet-comparison-to-non-clustered-topologies-scattering.png
+%         \includegraphics[width=\textwidth]{figures/d-cliques-mnist-lenet-comparison-to-non-clustered-topologies-scattering}
+%         \caption{\label{fig:d-cliques-mnist-lenet-comparison-to-non-clustered-topologies-scattering} LeNet Model (Scattering)}
+%        \end{subfigure}
+%       
+%         \caption{\label{fig:d-cliques-mnist-comparison-to-non-clustered-topologies} MNIST: Comparison to non-Clustered Topologies}
+%\end{figure}
+%
+% \begin{figure}
+% \centering
+%     % To regenerate the figure, from directory results/cifar10
+%% python ../../../learn-topology/tools/plot_convergence.py fully-connected-cliques/all/2021-03-10-13:58:57-CET no-init-no-clique-avg/fully-connected-cliques/all/2021-03-13-18:34:35-CET  random-10/all/2021-03-17-20:30:03-CET  random-10-diverse/all/2021-03-17-20:30:41-CET random-10-diverse-unbiased-gradient/all/2021-03-17-20:31:14-CET random-10-diverse-unbiased-gradient-uniform-init/all/2021-03-17-20:31:41-CET --labels 'd-clique (fcc) clique avg., uniform init.' 'd-clique (fcc) no clique avg. no uniform init.' '10 random edges' '10 random edges (all classes repr.)' '10 random (all classes repr.) with unbiased grad.' '10 random (all classes repr.) with unbiased grad., uniform init.' --add-min-max --legend 'upper right' --yaxis scattering --save-figure ../../figures/d-cliques-cifar10-linear-comparison-to-non-clustered-topologies-scattering.png --ymax 0.7
+%        \begin{subfigure}[b]{0.48\textwidth}
+%        \centering
+%         \includegraphics[width=\textwidth]{figures/d-cliques-cifar10-linear-comparison-to-non-clustered-topologies-scattering}
+%         \caption{\label{fig:d-cliques-cifar10-linear-comparison-to-non-clustered-topologies-scattering} LeNet Model: Scattering}
+%     \end{subfigure}  
+%         
+%\caption{\label{fig:d-cliques-cifar10-linear-comparison-to-non-clustered-topologies} CIFAR10: Comparison to non-Clustered Topologies}
+%\end{figure} 
+%
+%
+%\begin{itemize}
+%  \item Clustering does not seem to make a difference in MNIST, even when using a higher-capacity model (LeNet) instead of a linear model. (Fig.~\ref{fig:d-cliques-mnist-comparison-to-non-clustered-topologies})
+%  \item Except for the random 10 topology, convergence speed seems to be correlated with scattering in CIFAR-10 with LeNet model (Fig.~\ref{fig:d-cliques-cifar10-linear-comparison-to-non-clustered-topologies}). There is also more difference between topologies both in convergence speed and scattering than for MNIST (Fig.~\ref{fig:d-cliques-mnist-comparison-to-non-clustered-topologies}). Scattering is computed similarly to Consensus Control for Decentralized Deep Learning~\cite{consensus_distance}.
+%\end{itemize}
+%
+%
+
+% \clearpage
+
+\section{Additional Experiments on Scaling Behavior with Increasing Number of
+Nodes}
+\label{app:scaling}
+
+Section~\ref{section:interclique-topologies} compares the convergence speed of various inter-clique topologies at a scale of 1000 nodes. In this section, we show the effect of scaling the number of nodes, by comparing the convergence speed with 1, 10, 100, and 1000 nodes, and adjusting the batch size to maintain a constant number of updates per epoch. We present results for Ring, Fractal, Small-world, and Fully-Connected inter-clique topologies.
+ 
+Figure~\ref{fig:d-cliques-mnist-scaling-fully-connected} shows the results for
+MNIST. For all topologies, we notice a perfect scaling up to 100 nodes, i.e.,
+the accuracy curves overlap, with low variance between nodes. Starting at 1000
+nodes, there is a significant increase in variance between nodes and the
+convergence is slower, only marginally for Fully-Connected but
+significantly so for Fractal and Ring. Small-world has higher variance between nodes but maintains a convergence speed close to that of Fully-Connected.
+
+Figure~\ref{fig:d-cliques-cifar10-scaling-fully-connected} shows the results
+for CIFAR10. When increasing from 1 to 10 nodes (resulting in a single
+fully-connected clique), there is actually a small increase both in final
+accuracy and convergence speed. We believe this increase is due to the
+gradient being computed with exactly the same number of examples from all
+classes with 10 fully-connected non-IID nodes, while the gradient for a single
+non-IID node may have a slightly larger bias because the random sampling does
+not guarantee the representation of all classes perfectly in each batch. At a
+scale of 100 nodes, there is no difference between Fully-Connected and
+Fractal, as the connections are the same; however, a Ring already shows a
+significantly slower convergence. At 1000 nodes, the convergence significantly
+slows down for Fractal and Ring, while remaining close, albeit with a larger
+variance, for Fully-Connected. Similar to MNIST, Small-world has
+higher variance and slightly lower convergence speed than Fully-Connected but
+remains very close.
+
+We therefore conclude that Fully-Connected and Small-world have good scaling
+properties in terms of convergence speed, and that the
+linear-logarithmic number of edges of Small-world makes it the best compromise
+between convergence speed and connectivity, and thus the best choice for
+efficient large-scale decentralized learning in practice.
+
+\begin{figure}[htbp]
+         \centering     
+              % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py ../mnist/1-node-iid/all/2021-03-10-09:20:03-CET 10/mnist/fully-connected-cliques/all/2021-03-12-09:13:27-CET ../mnist/fully-connected-cliques/all/2021-03-10-10:19:44-CET 1000/mnist/fully-connected-cliques/all/2021-03-14-17:56:26-CET --labels '1 node IID bsz=12800' '10 nodes bsz=1280' '100 nodes bsz=128' '1000 nodes bsz=13' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-scaling-fully-connected-cst-updates.png --ymin 80 --add-min-max
+      \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-scaling-fully-connected-cst-updates}
+         \caption{Fully-Connected}
+     \end{subfigure}
+     \quad
+       % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py ../mnist/1-node-iid/all/2021-03-10-09:20:03-CET 10/mnist/fully-connected-cliques/all/2021-03-12-09:13:27-CET ../mnist/smallworld-logn-cliques/all/2021-03-23-21:44:56-CET 1000/mnist/smallworld-logn-cliques/all/2021-03-23-21:45:39-CET --labels '1 node IID bsz=12800' '10 nodes bsz=1280' '100 nodes bsz=128' '1000 nodes bsz=13' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-scaling-smallworld-cst-updates.png --ymin 80 --add-min-max
+      \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-scaling-smallworld-cst-updates}
+         \caption{Small-world}
+     \end{subfigure}
+     \quad
+
+          % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py ../mnist/1-node-iid/all/2021-03-10-09:20:03-CET 10/mnist/clique-ring/all/2021-03-13-18:22:01-CET ../mnist/fully-connected-cliques/all/2021-03-10-10:19:44-CET 1000/mnist/fractal-cliques/all/2021-03-14-17:41:59-CET --labels '1 node IID bsz=12800' '10 nodes bsz=1280' '100 nodes bsz=128' '1000 nodes bsz=13' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-scaling-fractal-cliques-cst-updates.png --ymin 80 --add-min-max
+         \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-scaling-fractal-cliques-cst-updates}
+         \caption{Fractal}
+     \end{subfigure}  
+     \quad
+     % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py ../mnist/1-node-iid/all/2021-03-10-09:20:03-CET 10/mnist/clique-ring/all/2021-03-13-18:22:01-CET ../mnist/clique-ring/all/2021-03-10-18:14:35-CET 1000/mnist/clique-ring/all/2021-03-13-18:22:36-CET --labels '1 node IID bsz=12800' '10 nodes bsz=1280' '100 nodes bsz=128' '1000 nodes bsz=13' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-scaling-clique-ring-cst-updates.png --ymin 80 --add-min-max
+         \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-mnist-scaling-clique-ring-cst-updates}
+         \caption{Ring}
+     \end{subfigure}  
+     
+     \caption{\label{fig:d-cliques-mnist-scaling-fully-connected} MNIST:
+     D-Cliques scaling behavior (constant updates per epoch) for different
+     inter-clique topologies.}
+\end{figure}
+     
+\begin{figure}[htbp]
+         \centering
+     
+              % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py ../cifar10/1-node-iid/all/2021-03-10-13:52:58-CET 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/fully-connected-cliques/all/2021-03-10-13:58:57-CET 1000/cifar10/fully-connected-cliques/all/2021-03-14-17:41:20-CET --labels '1 node IID bsz=2000' '10 nodes non-IID bsz=200' '100 nodes non-IID bsz=20' '1000 nodes non-IID bsz=2' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-fully-connected-cst-updates.png --add-min-max
+      \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-scaling-fully-connected-cst-updates}
+         \caption{Fully-Connected}
+     \end{subfigure}
+     \quad
+     % python ../../../learn-topology/tools/plot_convergence.py ../cifar10/1-node-iid/all/2021-03-10-13:52:58-CET 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/smallworld-logn-cliques/all/2021-03-23-22:13:23-CET 1000/cifar10/smallworld-logn-cliques/all/2021-03-23-22:13:57-CET --labels '1 node IID bsz=2000' '10 nodes non-IID bsz=200' '100 nodes non-IID bsz=20' '1000 nodes non-IID bsz=2' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-smallworld-cst-updates.png --add-min-max
+      \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-scaling-smallworld-cst-updates}
+         \caption{Small-world}
+     \end{subfigure}
+     
+     
+          % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py  ../cifar10/1-node-iid/all/2021-03-10-13:52:58-CET 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/fully-connected-cliques/all/2021-03-10-13:58:57-CET 1000/cifar10/fractal-cliques/all/2021-03-14-17:42:46-CET  --labels '1 node IID bsz=2000' '10 nodes non-IID bsz=200' '100 nodes non-IID bsz=20' '1000 nodes non-IID bsz=2'  --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates.png --add-min-max
+         \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-scaling-fractal-cliques-cst-updates}
+         \caption{Fractal}
+     \end{subfigure}  
+     \quad
+     % To regenerate the figure, from directory results/scaling
+% python ../../../learn-topology/tools/plot_convergence.py  ../cifar10/1-node-iid/all/2021-03-10-13:52:58-CET 10/cifar10/fully-connected-cliques/all/2021-03-13-19:06:02-CET ../cifar10/clique-ring/all/2021-03-10-11:58:43-CET 1000/cifar10/clique-ring/all/2021-03-14-09:55:24-CET  --labels '1 node IID bsz=2000' '10 nodes non-IID bsz=200' '100 nodes non-IID bsz=20' '1000 nodes non-IID bsz=2'   --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-cifar10-scaling-clique-ring-cst-updates.png --add-min-max
+         \begin{subfigure}[b]{0.35\textwidth}
+         \centering
+         \includegraphics[width=\textwidth]{../figures/d-cliques-cifar10-scaling-clique-ring-cst-updates}
+         \caption{Ring}
+     \end{subfigure}  
+     
+     \caption{\label{fig:d-cliques-cifar10-scaling-fully-connected} CIFAR10: D-Cliques scaling behavior (constant updates per epoch) for different
+     inter-clique topologies.}
+\end{figure}
+
+
+%%         % To regenerate the figure, from directory results/scaling
+%%% python ../../../learn-topology/tools/plot_convergence.py 10/mnist/fully-connected-cliques/all/2021-03-10-14:40:35-CET ../mnist/fully-connected-cliques/all/2021-03-10-10:19:44-CET 1000/mnist/fully-connected-cliques/all/2021-03-10-16:44:35-CET --labels '10 nodes bsz=128' '100 nodes bsz=128' '1000 nodes bsz=128 (45)' --legend 'lower right' --yaxis test-accuracy --save-figure ../../figures/d-cliques-mnist-scaling-fully-connected-cst-bsz.png --ymin 80 --add-min-max
+%         \begin{figure}[htbp]
+%         \centering
+%         \includegraphics[width=0.48\textwidth]{figures/d-cliques-mnist-scaling-fully-connected-cst-bsz}
+%         \caption{FCC: Constant Batch-Size}
+%     \end{figure} 
+     
+ 
+
+
+\end{document}
+
+
diff --git a/mlsys2022style/mlsys2022.bst b/mlsys2022style/mlsys2022.bst
new file mode 100644
index 0000000..b9fd17b
--- /dev/null
+++ b/mlsys2022style/mlsys2022.bst
@@ -0,0 +1,1439 @@
+%% File: `mlsys2022.bst'
+%% A modification of `plainnl.bst' for use with natbib package 
+%%
+%% Copyright 2010 Hal Daum\'e III
+%% Modified by Dimitris Papailiopoulos based on `icml2018.bst`
+%%
+%% Copyright 1993-2007 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+%%
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%%
+ % Version and source file information:
+ % \ProvidesFile{icml2010.mbs}[2007/11/26 1.93 (PWD)]
+ %
+ % BibTeX `plainnat' family
+ %   version 0.99b for BibTeX versions 0.99a or later,
+ %   for LaTeX versions 2.09 and 2e.
+ %
+ % For use with the `natbib.sty' package; emulates the corresponding
+ %   member of the `plain' family, but with author-year citations.
+ %
+ % With version 6.0 of `natbib.sty', it may also be used for numerical
+ %   citations, while retaining the commands \citeauthor, \citefullauthor,
+ %   and \citeyear to print the corresponding information.
+ %
+ % For version 7.0 of `natbib.sty', the KEY field replaces missing
+ %   authors/editors, and the date is left blank in \bibitem.
+ %
+ % Includes field EID for the sequence/citation number of electronic journals
+ %  which is used instead of page numbers.
+ %
+ % Includes fields ISBN and ISSN.
+ %
+ % Includes field URL for Internet addresses.
+ %
+ % Includes field DOI for Digital Object Identifiers.
+ %
+ % Works best with the url.sty package of Donald Arseneau.
+ %
+ % Works with identical authors and year are further sorted by
+ %   citation key, to preserve any natural sequence.
+ %
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    doi
+    eid
+    edition
+    editor
+    howpublished
+    institution
+    isbn
+    issn
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    url
+    volume
+    year
+  }
+  {}
+  { label extra.label sort.label short.list }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}  % assign the four codes (0 to 3) that output.state is compared against
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}  % write the pending item with the separator chosen by output.state; leave the new item pending
+{ 's :=  % s = new item; the previously pending item is now on top of the stack
+  output.state mid.sentence =
+    { ", " * write$ }  % mid.sentence: pending item followed by a comma and a space
+    { output.state after.block =
+        { add.period$ write$  % after.block: close the pending item with a period,
+          newline$
+          "\newblock " write$  % then open the next block on a fresh line
+        }
+        { output.state before.all =
+            'write$  % before.all: pending item written with no separator
+            { add.period$ " " * write$ }  % remaining state (after.sentence): period plus a space
+          if$
+        }
+      if$
+      mid.sentence 'output.state :=  % every non-mid state falls back to mid.sentence
+    }
+  if$
+  s  % push the new item; a later output call or fin.entry writes it
+}
+
+FUNCTION {output}  % hand the string on top of the stack to output.nonnull unless it is empty
+{ duplicate$ empty$
+    'pop$  % empty item: discard it silently
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}  % like output, but an empty item raises a warning; stack on entry: item, field name
+{ 't :=  % t = field name quoted in the warning text
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }  % drop the empty item and warn, naming the entry key
+    'output.nonnull
+  if$
+}
+
+FUNCTION {fin.entry}  % finish an entry: put a period on the pending item, write it, end the line
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}  % set output.state to after.block, except while it is still before.all
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}  % set output.state to after.sentence unless it is after.block or before.all
+{ output.state after.block =
+    'skip$  % a pending block break is not downgraded to a sentence break
+    { output.state before.all =
+        'skip$
+        { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}  % logical negation of the integer on top of the stack
+{   { #0 }  % true (positive) becomes 0
+    { #1 }  % anything else becomes 1
+  if$
+}
+
+FUNCTION {and}  % logical and of the two integers on top of the stack
+{   'skip$  % top is true: the other operand is the result
+    { pop$ #0 }  % top is false: discard the other operand, result is 0
+  if$
+}
+
+FUNCTION {or}  % logical or of the two integers on top of the stack
+{   { pop$ #1 }  % top is true: discard the other operand, result is 1
+    'skip$  % top is false: the other operand is the result
+  if$
+}
+
+FUNCTION {new.block.checka}  % pop one string; call new.block when it is not empty
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}  % pop two strings; call new.block unless both are empty
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}  % pop one string; call new.sentence when it is not empty
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}  % pop two strings; call new.sentence unless both are empty
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}  % replace an empty or missing value on top of the stack with the null string
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}  % wrap a non-empty string in an \emph group; an empty one becomes the null string
+{ duplicate$ empty$
+    { pop$ "" }
+    { "\emph{" swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}  % pop a name list and push it formatted for the reference entry
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{vv~}{ll}{, jj}{, f.}" format.name$ 't :=  % t = von Last, Jr, initials for name number nameptr
+      nameptr #1 >
+        { namesleft #1 >
+            { ", " * t * }  % neither first nor last name: joined with a comma
+            { numnames #2 >
+                { "," * }  % comma before the final connector only with three or more names
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }  % a final name of others is rendered as et al.
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't  % the first name starts the result string
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.key}  % pop a field value; push the key field when that value is empty, else the null string
+{ empty$
+    { key field.or.null }
+    { "" }
+  if$
+}
+
+FUNCTION {format.authors}  % push the formatted author list, or the null string when author is empty
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}  % push the formatted editor list with an (ed.) or (eds.) suffix, or the null string
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+        { " (eds.)" * }  % plural suffix for more than one editor
+        { " (ed.)" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.isbn}  % push ISBN text, or the null string when isbn is empty
+{ isbn empty$
+    { "" }
+    { new.block "ISBN " isbn * }  % side effect: requests a new block when the field is present
+  if$
+}
+
+FUNCTION {format.issn}  % push ISSN text, or the null string when issn is empty
+{ issn empty$
+    { "" }
+    { new.block "ISSN " issn * }  % side effect: requests a new block when the field is present
+  if$
+}
+
+FUNCTION {format.url}  % push the url wrapped in a \url command, or the null string when url is empty
+{ url empty$
+    { "" }
+    { new.block "URL \url{" url * "}" * }  % side effect: requests a new block when the field is present
+  if$
+}
+
+FUNCTION {format.doi}  % push the doi wrapped in a \doi command, or the null string when doi is empty
+{ doi empty$
+    { "" }
+    { new.block "\doi{" doi * "}" * }  % side effect: requests a new block when the field is present
+  if$
+}
+
+FUNCTION {format.title}  % push the title converted with change.case$ mode t, or the null string
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {format.full.names}  % pop a name list and push it using von and last name parts only
+{'s :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr
+      "{vv~}{ll}" format.name$ 't :=  % t = von Last for name number nameptr
+      nameptr #1 >
+        {
+          namesleft #1 >
+            { ", " * t * }  % neither first nor last name: joined with a comma
+            {
+              numnames #2 >
+                { "," * }  % comma before the final connector only with three or more names
+                'skip$
+              if$
+              t "others" =
+                { " et~al." * }  % a final name of others is rendered as et al.
+                { " and " * t * }
+              if$
+            }
+          if$
+        }
+        't  % the first name starts the result string
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {author.editor.full}  % full name list from author, falling back to editor, else the null string
+{ author empty$
+    { editor empty$
+        { "" }
+        { editor format.full.names }
+      if$
+    }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {author.full}  % full name list from author, or the null string
+{ author empty$
+    { "" }
+    { author format.full.names }
+  if$
+}
+
+FUNCTION {editor.full}  % full name list from editor, or the null string
+{ editor empty$
+    { "" }
+    { editor format.full.names }
+  if$
+}
+
+FUNCTION {make.full.names}  % pick the full-name source according to the entry type
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.full  % book and inbook: author, else editor
+    { type$ "proceedings" =
+        'editor.full  % proceedings: editor only
+        'author.full  % every other type: author only
+      if$
+    }
+  if$
+}
+
+FUNCTION {output.bibitem}  % write the \bibitem header line and reset the output state for a new entry
+{ newline$
+  "\bibitem[" write$
+  label write$
+  ")" make.full.names duplicate$ short.list =  % full names follow the closing parenthesis...
+     { pop$ }  % ...unless they equal short.list, in which case they are dropped
+     { * }
+   if$
+  "]{" * write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""  % null string becomes the pending item for the first output call
+  before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}  % pop a string and push a copy in which each lone hyphen is doubled
+{ 't :=
+  ""  % result accumulator
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+        { t #1 #2 substring$ "--" = not
+            { "--" *  % single hyphen: emit two
+              t #2 global.max$ substring$ 't :=
+            }
+            {   { t #1 #1 substring$ "-" = }  % run of two or more hyphens: copy it unchanged
+                { "-" *
+                  t #2 global.max$ substring$ 't :=
+                }
+              while$
+            }
+          if$
+        }
+        { t #1 #1 substring$ *  % any other character is copied as is
+          t #2 global.max$ substring$ 't :=
+        }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}  % push month (when set), year and extra.label as one string; warn on an empty year
+{ year duplicate$ empty$
+    { "empty year in " cite$ * warning$
+       pop$ "" }  % continue with the null string in place of the year
+    'skip$
+  if$
+  month empty$
+    'skip$
+    { month
+      " " * swap$ *  % result so far: month, a space, then the year
+    }
+  if$
+  extra.label *
+}
+
+FUNCTION {format.btitle}  % push the title passed through emphasize (book-style title)
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}  % join the two top strings with a tie or a space, chosen by the length of the second
+{ duplicate$ text.length$ #3 <
+    { "~" }  % second string shorter than 3 text characters: tie
+    { " " }
+  if$
+  swap$ * *  % result: first string, separator, second string
+}
+
+FUNCTION {either.or.check}  % stack on entry: description, field; warn when the field is not empty
+{ empty$
+    'pop$  % field empty: discard the description, no warning
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}  % push the volume text with optional emphasized series, or the null string
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+        'skip$
+        { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check  % warns when number is set together with volume
+    }
+  if$
+}
+
+FUNCTION {format.number.series}  % push number-in-series text; yields the null string whenever volume is set
+{ volume empty$
+    { number empty$
+        { series field.or.null }  % no number: the bare series, or the null string
+        { output.state mid.sentence =
+            { "number" }  % lower case inside a sentence
+            { "Number" }
+          if$
+          number tie.or.space.connect
+          series empty$
+            { "there's a number but no series in " cite$ * warning$ }
+            { " in " * series * }
+          if$
+        }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}  % push the edition followed by the word edition, or the null string
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+        { edition "l" change.case$ " edition" * }  % mid.sentence: change.case$ mode l
+        { edition "t" change.case$ " edition" * }  % any other state: change.case$ mode t
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}  % pop a string; push 1 when it contains a hyphen, comma or plus sign, else 0
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and  % scan until a match is found or the string is used up
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+        { #1 'multiresult := }
+        { t #2 global.max$ substring$ 't := }  % no match: drop the first character and continue
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}  % push the pages field prefixed with pp., or the null string when pages is empty
+{ pages empty$
+    { "" }
+    { pages duplicate$ multi.page.check 'n.dashify 'skip$ if$  % ranges and lists get lone hyphens doubled
+      duplicate$ text.length$ #3 <  % same length threshold as tie.or.space.connect
+        { "pp.~" } { "pp.\ " }  % exactly one separator: tie for short strings, control space otherwise
+      if$ swap$ *  % the old prefix-plus-tie.or.space.connect form emitted a control space and a tie together
+    }
+  if$
+}
+
+FUNCTION {format.eid}  % push the eid prefixed with art., or the null string when eid is empty
+{ eid empty$
+    { "" }
+    { "art." eid tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}  % push volume, parenthesised number and pages as one string
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *  % append the parenthesised number after a break point
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.pages }  % nothing built so far: use format.pages on its own
+        { ":\penalty0 " * pages n.dashify * }  % otherwise a colon, then the dashified pages
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.eid}  % push volume, parenthesised number and eid as one string
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "\penalty0 (" number * ")" * *  % append the parenthesised number after a break point
+      volume empty$
+        { "there's a number but no volume in " cite$ * warning$ }
+        'skip$
+      if$
+    }
+  if$
+  eid empty$
+    'skip$
+    { duplicate$ empty$
+        { pop$ format.eid }  % nothing built so far: use format.eid on its own
+        { ":\penalty0 " * eid * }  % otherwise a colon, then the eid
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}  % push chapter text followed by pages; format.pages alone when chapter is empty
+{ chapter empty$
+    'format.pages
+    { type empty$
+        { "chapter" }
+        { type "l" change.case$ }  % a type field replaces the word chapter, lower-cased
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+        'skip$
+        { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}  % push In, optional editors and the emphasized booktitle, or the null string
+{ booktitle empty$
+    { "" }
+    { editor empty$
+        { "In " booktitle emphasize * }
+        { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}  % warn when author, title, howpublished, month, year and note are all empty
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and  % the warning is raised only when key is not empty
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}  % replace the string on top of the stack with the type field when type is set
+{ type empty$
+    'skip$  % no type: the string already on the stack is kept
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}  % push the report type (default Technical Report) followed by number when set
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }  % no number: the type text alone, converted with mode t
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}  % push the In-prefixed \citet reference used by a cross-referencing article
+{ key empty$
+    { journal empty$
+        { "need key or journal for " cite$ * " to crossref " * crossref *
+          warning$
+          ""  % continue with the null string after the warning
+        }
+        { "In \emph{" journal * "}" * }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.book.crossref}  % push the volume lead-in and \citet reference used by a cross-referencing book
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or  % true when editor is empty or identical to author
+    { key empty$
+        { series empty$
+            { "need editor, key, or series for " cite$ * " to crossref " *
+              crossref * warning$
+              "" *
+            }
+            { "\emph{" * series * "}" * }  % the emphasized series is appended to the lead-in
+          if$
+        }
+        'skip$
+      if$
+    }
+    'skip$
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}  % push the In-prefixed \citet reference for incollection and inproceedings
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or  % true when editor is empty or identical to author
+    { key empty$
+        { booktitle empty$
+            { "need editor, key, or booktitle for " cite$ * " to crossref " *
+              crossref * warning$
+              ""  % continue with the null string after the warning
+            }
+            { "In \emph{" booktitle * "}" * }
+          if$
+        }
+        { "In " }
+      if$
+    }
+    { "In " }
+  if$
+  " \citet{" * crossref * "}" *
+}
+
+FUNCTION {article}  % entry-type function: write the reference entry for an article
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output  % key is output when author is empty
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      eid empty$
+        { format.vol.num.pages output }
+        { format.vol.num.eid output }  % an eid takes the place of the pages
+      if$
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull  % cross-referenced: point at the parent entry instead
+      eid empty$
+        { format.pages output }
+        { format.eid output }
+      if$
+    }
+  if$
+  format.issn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}  % entry-type function: write the reference entry for a book
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check  % no author: editors lead the entry
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }  % warns when editor is set together with author
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull  % cross-referenced: point at the parent entry instead
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}  % entry-type function: write the reference entry for a booklet
+{ output.bibitem
+  format.authors output  % author is optional here: no warning when it is empty
+  author format.key output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb  % new block unless both fields are empty
+  howpublished output
+  address output
+  format.date output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check
+      editor format.key output
+    }
+    { format.authors output.nonnull
+      crossref missing$
+        { "author and editor" editor either.or.check }
+        'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+        { organization publisher new.sentence.checkb
+          organization output
+          publisher output
+          format.date "year" output.check
+        }
+        { address output.nonnull
+          format.date "year" output.check
+          new.sentence
+          organization output
+          publisher output
+        }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  organization address new.block.checkb
+  organization output
+  address output
+  format.edition output
+  format.date output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  author format.key output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  format.issn output
+  format.url output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  format.editors output
+  editor format.key output
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address output
+  format.date "year" output.check
+  new.sentence
+  organization output
+  publisher output
+  format.isbn output
+  format.doi output
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  format.url output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  author format.key output
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  format.url output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+
+READ
+
+FUNCTION {sortify}         % normalise the string on top of the stack for use in a sort key
+{ purify$                  % built-in: strips TeX markup and most non-alphanumeric characters
+  "l" change.case$         % built-in: convert to lower case
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}       % stack: prefix len string -> string with a leading prefix removed
+{ 's :=                    % s   = string to examine (was on top of the stack)
+  'len :=                  % len = number of characters in the prefix
+  s #1 len substring$ =    % compare the first len characters of s with the prefix left on the stack
+    { s len #1 + global.max$ substring$ }   % match: return s from position len+1 onward
+    's                     % no match: return s unchanged
+  if$
+}
+
+FUNCTION {format.lab.names}   % stack: name list -> short name string used in citation labels
+{ 's :=
+  s #1 "{vv~}{ll}" format.name$          % von part and last name of the first person
+  s num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }                 % more than two names: first name followed by " et~al."
+    { #2 <
+        'skip$                           % a single name: nothing to append
+        { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+            { " et~al." * }              % second name is the literal "others"
+            { " \& " * s #2 "{vv~}{ll}" format.name$ * }   % exactly two names: join with " \& "
+          if$
+        }
+      if$
+    }
+  if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+    { key empty$
+        { cite$ #1 #3 substring$ }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { cite$ #1 #3 substring$ }
+            'key
+          if$
+        }
+        { editor format.lab.names }
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {author.key.organization.label}
+{ author empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { author format.lab.names }
+  if$
+}
+
+FUNCTION {editor.key.organization.label}
+{ editor empty$
+    { key empty$
+        { organization empty$
+            { cite$ #1 #3 substring$ }
+            { "The " #4 organization chop.word #3 text.prefix$ }
+          if$
+        }
+        'key
+      if$
+    }
+    { editor format.lab.names }
+  if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.key.label
+    { type$ "proceedings" =
+        'editor.key.organization.label
+        { type$ "manual" =
+            'author.key.organization.label
+            'author.key.label
+          if$
+        }
+      if$
+    }
+  if$
+  'short.list :=
+}
+
+FUNCTION {calc.label}      % sets label to short.list followed by "(" and, usually, the year
+{ calc.short.authors       % fills short.list for the current entry
+  short.list
+  "("
+  *
+  year duplicate$ empty$
+  short.list key field.or.null = or   % true when year is empty or short.list equals key field.or.null
+     { pop$ "" }                      % in that case the year is left out of the label
+     'skip$
+  if$
+  *
+  'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    {
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr #1 >
+        {
+          "   "  *
+          namesleft #1 = t "others" = and
+            { "zzzzz" * }
+            { numnames #2 > nameptr #2 = and
+                { "zz" * year field.or.null * "   " * }
+                'skip$
+              if$
+              t sortify *
+            }
+          if$
+        }
+        { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+        { "to sort, need author or key in " cite$ * warning$
+          ""
+        }
+        { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+        { key empty$
+            { "to sort, need author, editor, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need author, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+        { key empty$
+            { "to sort, need editor, organization, or key in " cite$ * warning$
+              ""
+            }
+            { key sortify }
+          if$
+        }
+        { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+
+FUNCTION {presort}
+{ calc.label
+  label sortify
+  "    "
+  *
+  type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+        'editor.organization.sort
+        { type$ "manual" =
+            'author.organization.sort
+            'author.sort
+          if$
+        }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  cite$
+  *
+  #1 entry.max$ substring$
+  'sort.label :=
+  sort.label *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label last.label next.extra }
+
+INTEGERS { longest.label.width last.extra.num number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #0 int.to.chr$ 'last.label :=
+  "" 'next.extra :=
+  #0 'longest.label.width :=
+  #0 'last.extra.num :=
+  #0 'number.label :=
+}
+
+FUNCTION {forward.pass}    % iterated over the sorted entries: assigns suffixes to repeated labels
+{ last.label label =
+    { last.extra.num #1 + 'last.extra.num :=        % same label as the previous entry: advance the suffix code
+      last.extra.num int.to.chr$ 'extra.label :=    % the suffix is stored as a single character
+    }
+    { "a" chr.to.int$ 'last.extra.num :=            % different label: restart the suffix code at "a"
+      "" 'extra.label :=                            % and give this entry no suffix for now
+      label 'last.label :=
+    }
+  if$
+  number.label #1 + 'number.label :=                % running count of entries visited
+}
+
+FUNCTION {reverse.pass}    % run via REVERSE: finalises each suffix and appends it to label
+{ next.extra "b" =                                  % the entry visited just before this one was given "b"
+    { "a" 'extra.label := }                         % so this entry becomes the "a" of that group
+    'skip$
+  if$
+  extra.label 'next.extra :=                        % remember this suffix for the next entry visited
+  extra.label
+  duplicate$ empty$
+    'skip$
+    { "{\natexlab{" swap$ * "}}" * }                % wrap a non-empty suffix in \natexlab
+  if$
+  'extra.label :=
+  label extra.label * 'label :=
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION {bib.sort.order}
+{ sort.label  'sort.key$ :=
+}
+
+ITERATE {bib.sort.order}
+
+SORT
+
+FUNCTION {begin.bib}       % writes the preamble, \begin{thebibliography} and fallback macro definitions
+{   preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }                   % emit the preamble text first when there is any
+  if$
+  "\begin{thebibliography}{" number.label int.to.str$ * "}" *   % argument is the count kept in number.label
+  write$ newline$
+  "\providecommand{\natexlab}[1]{#1}"
+  write$ newline$
+  "\providecommand{\url}[1]{\texttt{#1}}"
+  write$ newline$
+  "\expandafter\ifx\csname urlstyle\endcsname\relax"
+  write$ newline$
+  "  \providecommand{\doi}[1]{doi: #1}\else"
+  write$ newline$
+  "  \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi"
+  write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
diff --git a/mlsys2022style/mlsys2022.sty b/mlsys2022style/mlsys2022.sty
new file mode 100644
index 0000000..a081b7a
--- /dev/null
+++ b/mlsys2022style/mlsys2022.sty
@@ -0,0 +1,750 @@
+% File: mlsys2022.sty (LaTeX style file for mlsys-2022, version of 2017-10-28)
+
+% This file contains the LaTeX formatting parameters for a two-column 
+% conference proceedings that is 8.5 inches wide by 11 inches high.  
+%
+% Modified by Dimitris Papailiopoulos and Virginia Smith based on style file of ICML 2018,
+% which can be found at:
+% <https://media.nips.cc/Conferences/ICML2018/Styles/icml2018_style.tar.gz>
+%
+% Hacked by Terran Lane, 2003:
+% - Updated to use LaTeX2e style file conventions (ProvidesPackage,
+%   etc.)
+% - Added an ``appearing in'' block at the base of the first column
+%   (thus keeping the ``appearing in'' note out of the bottom margin
+%   where the printer should strip in the page numbers).
+% - Added a package option [accepted] that selects between the ``Under
+%   review'' notice (default, when no option is specified) and the
+%   ``Appearing in'' notice (for use when the paper has been accepted
+%   and will appear).
+%
+%   Originally created as:  ml2k.sty (LaTeX style file for ICML-2000)
+%   by P. Langley (12/23/99)
+
+%%%%%%%%%%%%%%%%%%%%
+%% This version of the style file supports both a ``review'' version
+%% and a ``final/accepted'' version.  The difference is only in the
+%% text that appears in the note at the bottom of the first column of
+%% the first page.  The default behavior is to print a note to the
+%% effect that the paper is under review and don't distribute it.  The
+%% final/accepted version prints an ``Appearing in'' note.  To get the
+%% latter behavior, in the calling file change the ``usepackage'' line
+%% from:
+%%	\usepackage{mlsys2022}
+%% to
+%%	\usepackage[accepted]{mlsys2022}
+%%%%%%%%%%%%%%%%%%%%
+
+\NeedsTeXFormat{LaTeX2e}
+\ProvidesPackage{mlsys2022}[2018/07/02 v0.1 mlsys Conference Style File]
+
+% Before 2018, \usepackage{times} was in the example TeX, but inevitably
+% not everybody did it.
+\RequirePackage{times}
+
+% Use fancyhdr package
+\RequirePackage{fancyhdr}
+\RequirePackage{color}
+\RequirePackage{algorithm}
+\RequirePackage{algorithmic}
+\RequirePackage{natbib}
+\RequirePackage{eso-pic} % used by \AddToShipoutPicture 
+\RequirePackage{forloop}
+
+%%%%%%%% Options
+\DeclareOption{accepted}{%
+  \renewcommand{\Notice@String}{\mlsys@appearing}
+  \gdef\isaccepted{1}
+}
+\DeclareOption{nohyperref}{%
+  \gdef\nohyperref{1}
+}
+
+%%%%%%%%%%%%%%%%%%%%
+% This string is printed at the bottom of the page for the
+% final/accepted version of the ``appearing in'' note.  Modify it to
+% change that text.
+%%%%%%%%%%%%%%%%%%%%
+%\newcommand{\ICML@appearing}{\textit{mlsys 2022},
+\newcommand{\mlsys@appearing}{
+	\textit{Proceedings of the
+		$\mathit{5}^{th}$ MLSys Conference},
+	Santa Clara, CA, USA, 2022.
+	Copyright 2022 by the author(s).}
+
+%%%%%%%%%%%%%%%%%%%%
+% This string is printed at the bottom of the page for the draft/under
+% review version of the ``appearing in'' note.  Modify it to change
+% that text.
+%%%%%%%%%%%%%%%%%%%%
+\newcommand{\Notice@String}{Preliminary work.  Under review by the
+Machine Learning and  Systems  (MLSys) Conference\@.  Do not distribute.}
+
+% Cause the declared options to actually be parsed and activated
+\ProcessOptions\relax
+
+\ifdefined\nohyperref\else\ifdefined\hypersetup % must follow \ProcessOptions: the option flags are only set there
+  \definecolor{mydarkblue}{rgb}{0,0.08,0.45}
+  \hypersetup{ %
+    pdftitle={},
+    pdfauthor={},
+    pdfsubject={mlsys 2022},
+    pdfkeywords={},
+    pdfborder=0 0 0,
+    pdfpagemode=UseNone,
+    colorlinks=true,
+    linkcolor=mydarkblue,
+    citecolor=mydarkblue,
+    filecolor=mydarkblue,
+    urlcolor=mydarkblue,
+    pdfview=FitH}
+
+  \ifdefined\isaccepted \else
+    \hypersetup{pdfauthor={Anonymous Submission}}
+  \fi
+\fi\fi
+
+% Uncomment the following for debugging.  It will cause LaTeX to dump
+% the version of the ``appearing in'' string that will actually appear
+% in the document.
+%\typeout{>> Notice string='\Notice@String'}
+
+% Change citation commands to be more like old ICML styles
+\newcommand{\yrcite}[1]{\citeyearpar{#1}}
+\renewcommand{\cite}[1]{\citep{#1}}
+
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% to ensure the letter format is used. pdflatex does compile the
+% page size into the pdf. This is done using \pdfpagewidth and 
+% \pdfpageheight. As LaTeX does not know these directives, we first
+% check whether pdflatex or latex is used.
+%
+% Kristian Kersting 2005
+%
+% in order to account for the more recent use of pdfetex as the default
+% compiler, I have changed the pdf verification.
+%
+% Ricardo Silva 2007
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\paperwidth=8.5in
+\paperheight=11in
+
+% old PDFLaTex verification, circa 2005
+%
+%\newif\ifpdf\ifx\pdfoutput\undefined
+%  \pdffalse % we are not running PDFLaTeX
+%\else
+%  \pdfoutput=1 % we are running PDFLaTeX
+%  \pdftrue
+%\fi
+
+\newif\ifpdf %adapted from ifpdf.sty
+\ifx\pdfoutput\undefined
+\else
+   \ifx\pdfoutput\relax
+   \else
+     \ifcase\pdfoutput
+     \else
+       \pdftrue
+     \fi
+   \fi
+\fi
+
+\ifpdf
+%    \pdfpagewidth=\paperwidth
+%    \pdfpageheight=\paperheight
+  \setlength{\pdfpagewidth}{8.5in}
+  \setlength{\pdfpageheight}{11in}
+\fi
+
+% Physical page layout 
+
+\evensidemargin -0.23in  
+\oddsidemargin -0.23in 
+\setlength\textheight{9.0in}
+\setlength\textwidth{6.75in} 
+\setlength\columnsep{0.25in}
+\setlength\headheight{10pt}
+\setlength\headsep{10pt} 
+\addtolength{\topmargin}{-20pt}
+\addtolength{\topmargin}{-0.29in}
+
+% Historically many authors tried to include packages like geometry or fullpage,
+% which change the page layout. It either makes the proceedings inconsistent, or
+% wastes organizers' time chasing authors. So let's nip these problems in the
+% bud here. -- Iain Murray 2018.
+%\RequirePackage{printlen}
+\AtBeginDocument{%
+% To get the numbers below, include printlen package above and see lengths like this:
+%\printlength\oddsidemargin\\
+%\printlength\headheight\\
+%\printlength\textheight\\
+%\printlength\marginparsep\\
+%\printlength\footskip\\
+%\printlength\hoffset\\
+%\printlength\paperwidth\\
+%\printlength\topmargin\\
+%\printlength\headsep\\
+%\printlength\textwidth\\
+%\printlength\marginparwidth\\
+%\printlength\marginparpush\\
+%\printlength\voffset\\
+%\printlength\paperheight\\
+% Each test below compares one layout length with the value this style sets; any mismatch raises the flag.
+\newif\ifmarginsmessedwith
+\marginsmessedwithfalse
+\ifdim\oddsidemargin=-16.62178pt     \else oddsidemargin has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\headheight=10.0pt             \else headheight has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\textheight=650.43pt           \else textheight has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\marginparsep=11.0pt           \else marginparsep has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\footskip=0.0pt                \else footskip has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\hoffset=0.0pt                 \else hoffset has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\paperwidth=614.295pt          \else paperwidth has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\topmargin=-24.95781pt         \else topmargin has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\headsep=10.0pt                \else headsep has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\textwidth=487.8225pt          \else textwidth has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\marginparwidth=65.0pt         \else marginparwidth has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\marginparpush=5.0pt           \else marginparpush has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\voffset=0.0pt                 \else voffset has been altered.\\ \marginsmessedwithtrue\fi
+\ifdim\paperheight=794.96999pt       \else paperheight has been altered.\\ \marginsmessedwithtrue\fi
+\ifmarginsmessedwith
+
+\textbf{\large \em The page layout violates the MLSys style.}
+
+Please do not change the page layout, or include packages like geometry,
+savetrees, or fullpage, which change it for you.
+
+We're not able to reliably undo arbitrary changes to the style. Please remove
+the offending package(s), or layout-changing commands and try again.
+
+\fi}
+
+
+%% The following is adapted from code in the acmconf.sty conference
+%% style file.  The constants in it are somewhat magical, and appear
+%% to work well with the two-column format on US letter paper that
+%% ICML uses, but will break if you change that layout, or if you use
+%% a longer block of text for the copyright notice string.  Fiddle with
+%% them if necessary to get the block to fit/look right.
+%%
+%% -- Terran Lane, 2003
+%%
+%% The following comments are included verbatim from acmconf.sty:
+%%
+%%% This section (written by KBT) handles the 1" box in the lower left
+%%% corner of the left column of the first page by creating a picture,
+%%% and inserting the predefined string at the bottom (with a negative
+%%% displacement to offset the space allocated for a non-existent
+%%% caption).
+%%%
+\def\ftype@copyrightbox{8}
+\def\@copyrightspace{
+% Create a float object positioned at the bottom of the column.  Note
+% that because of the mystical nature of floats, this has to be called
+% before the first column is populated with text (e.g., from the title
+% or abstract blocks).  Otherwise, the text will force the float to
+% the next column.  -- TDRL.
+\@float{copyrightbox}[b]
+\begin{center}
+\setlength{\unitlength}{1pc}
+\begin{picture}(20,1.5)
+% Create a line separating the main text from the note block.
+% 4.818pc==0.8in.
+\put(0,2.5){\line(1,0){4.818}}
+% Insert the text string itself.  Note that the string has to be
+% enclosed in a parbox -- the \put call needs a box object to
+% position.  Without the parbox, the text gets splattered across the
+% bottom of the page semi-randomly.  The 19.75pc distance seems to be
+% the width of the column, though I can't find an appropriate distance
+% variable to substitute here.  -- TDRL.
+\put(0,0){\parbox[b]{19.75pc}{\small \Notice@String}}
+\end{picture}
+\end{center}
+\end@float}
+
+% Note: A few Latex versions need the next line instead of the former.
+% \addtolength{\topmargin}{0.3in}
+% \setlength\footheight{0pt}
+\setlength\footskip{0pt} 
+%\pagestyle{empty} 
+\flushbottom \twocolumn
+\sloppy
+
+% Clear out the addcontentsline command
+\def\addcontentsline#1#2#3{}
+ 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%% commands for formatting paper title, author names, and addresses. 
+
+%%start%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%% title as running head -- Kristian Kersting 2005 %%%%%%%%%%%%%
+
+
+%\makeatletter
+%\newtoks\mytoksa
+%\newtoks\mytoksb
+%\newcommand\addtomylist[2]{%
+%  \mytoksa\expandafter{#1}%
+%  \mytoksb{#2}%
+%  \edef#1{\the\mytoksa\the\mytoksb}%
+%}
+%\makeatother 
+
+% box to check the size of the running head
+\newbox\titrun
+
+% general page style
+\pagestyle{fancy}
+\fancyhf{}
+\fancyhead{}
+\fancyfoot{}
+% set the width of the head rule to 1 point
+\renewcommand{\headrulewidth}{1pt}
+
+% definition to set the head as running head in the preamble
+\def\mlsystitlerunning#1{\gdef\@mlsystitlerunning{#1}}
+
+% main definition adapting \mlsystitle from 2022
+\long\def\mlsystitle#1{%
+
+   %check whether @mlsystitlerunning exists
+   % if not \mlsystitle is used as running head
+   \ifx\undefined\@mlsystitlerunning%
+	\gdef\@mlsystitlerunning{#1}
+   \fi
+
+   %add it to pdf information
+  \ifdefined\nohyperref\else\ifdefined\hypersetup
+     \hypersetup{pdftitle={#1}}
+   \fi\fi
+
+   %get the dimension of the running title
+   \global\setbox\titrun=\vbox{\small\bf\@mlsystitlerunning}
+
+   % error flag
+   \gdef\@runningtitleerror{0}
+
+   % running title too long
+   \ifdim\wd\titrun>\textwidth%
+	  {\gdef\@runningtitleerror{1}}%
+   % running title breaks a line
+   \else\ifdim\ht\titrun>6.25pt
+	   {\gdef\@runningtitleerror{2}}%
+	\fi
+   \fi 
+
+   % if there is something wrong with the running title
+   \ifnum\@runningtitleerror>0
+	   \typeout{}%
+           \typeout{}%
+           \typeout{*******************************************************}%
+           \typeout{Title exceeds size limitations for running head.}%
+           \typeout{Please supply a shorter form for the running head}
+           \typeout{with \string\mlsystitlerunning{...}\space prior to \string\begin{document}}%
+           \typeout{*******************************************************}%
+ 	    \typeout{}%
+           \typeout{}%
+           % set default running title
+	   \chead{\small\bf Title Suppressed Due to Excessive Size}%
+    \else
+	   % 'everything' fine, set provided running title
+  	   \chead{\small\bf\@mlsystitlerunning}%
+    \fi
+
+  % no running title on the first page of the paper
+  \thispagestyle{empty}
+
+%%%%%%%%%%%%%%%%%%%% Kristian Kersting %%%%%%%%%%%%%%%%%%%%%%%%%  
+%end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+  {\center\baselineskip 18pt
+                       \toptitlebar{\Large\bf\scshape #1}\bottomtitlebar}
+}
+
+
+\gdef\icmlfullauthorlist{}
+\newcommand\addstringtofullauthorlist{\g@addto@macro\icmlfullauthorlist}
+\newcommand\addtofullauthorlist[1]{%
+  \ifdefined\icmlanyauthors% a name was added earlier: separate the new one with a comma
+    \addstringtofullauthorlist{, #1}%
+  \else% first name: append it as-is and set the flag
+    \addstringtofullauthorlist{#1}%
+    \gdef\icmlanyauthors{1}%
+  \fi%
+  \ifdefined\nohyperref\else\ifdefined\hypersetup% only when \hypersetup exists and nohyperref was not requested
+    \hypersetup{pdfauthor=\icmlfullauthorlist}%
+  \fi\fi}
+
+%%% GS: changed toptitlebar from 1pt to 2pt
+\def\toptitlebar{\hrule height2pt \vskip .25in} 
+\def\bottomtitlebar{\vskip .22in \hrule height1pt \vskip .3in} 
+
+\newenvironment{mlsysauthorlist}{%
+  \setlength\topsep{0pt}
+  \setlength\parskip{0pt}
+  \begin{center}
+}{%
+  \end{center}
+}
+
+\newcounter{@affiliationcounter}
+\newcommand{\@pa}[1]{%
+% Print the superscript marker for label #1, allocating a new affiliation number on first use.
+\ifcsname the@affil#1\endcsname
+   % a counter for label #1 already exists: nothing to allocate
+\else
+  \ifcsname @icmlsymbol#1\endcsname
+    % label #1 has a symbol defined via \mlsyssetsymbol: no counter is allocated
+  \else
+  \stepcounter{@affiliationcounter}%
+  \newcounter{@affil#1}%
+  \setcounter{@affil#1}{\value{@affiliationcounter}}%
+  \fi
+\fi%
+\ifcsname @icmlsymbol#1\endcsname
+  \textsuperscript{\csname @icmlsymbol#1\endcsname\,}%
+\else
+  %\expandafter\footnotemark[\arabic{@affil#1}\,]%
+  \textsuperscript{\arabic{@affil#1}\,}%
+\fi
+}
+
+%\newcommand{\mlsysauthor}[2]{%
+%\addtofullauthorlist{#1}%
+%#1\@for\theaffil:=#2\do{\pa{\theaffil}}%
+%}
+\newcommand{\mlsysauthor}[2]{%
+  \ifdefined\isaccepted
+    \mbox{\bf #1}\,\@for\theaffil:=#2\do{\@pa{\theaffil}} \addtofullauthorlist{#1}%
+   \else
+    \ifdefined\@icmlfirsttime
+    \else
+      \gdef\@icmlfirsttime{1}
+      \mbox{\bf Anonymous Authors}\@pa{@anon} \addtofullauthorlist{Anonymous Authors}
+     \fi
+    \fi
+}
+
+\newcommand{\mlsyssetsymbol}[2]{%
+  \expandafter\gdef\csname @icmlsymbol#1\endcsname{#2}
+ }
+   
+
+\newcommand{\mlsysaffiliation}[2]{%
+\ifdefined\isaccepted
+\ifcsname the@affil#1\endcsname
+ \expandafter\gdef\csname @affilname\csname the@affil#1\endcsname\endcsname{#2}%
+\else
+  {\bf AUTHORERR: Error in use of \textbackslash{}mlsysaffiliation command. Label ``#1'' not mentioned in some \textbackslash{}mlsysauthor\{author name\}\{labels here\} command beforehand. }
+  \typeout{}%
+  \typeout{}%
+  \typeout{*******************************************************}%
+  \typeout{Affiliation label undefined. }%
+  \typeout{Make sure \string\mlsysaffiliation\space follows }
+  \typeout{all of \string\mlsysauthor\space commands}%
+  \typeout{*******************************************************}%
+  \typeout{}%
+  \typeout{}%
+\fi
+\else % \isaccepted
+ % can be called multiple times... it's idempotent
+ \expandafter\gdef\csname @affilname1\endcsname{Anonymous Institution, Anonymous City, Anonymous Region, Anonymous Country}
+\fi
+}
+
+\newcommand{\mlsyscorrespondingauthor}[2]{
+\ifdefined\isaccepted
+ \ifdefined\mlsyscorrespondingauthor@text
+   \g@addto@macro\mlsyscorrespondingauthor@text{, #1 \textless{}#2\textgreater{}}
+ \else
+   \gdef\mlsyscorrespondingauthor@text{#1 \textless{}#2\textgreater{}}
+ \fi
+\else
+\gdef\mlsyscorrespondingauthor@text{Anonymous Author \textless{}anon.email@domain.com\textgreater{}}
+\fi
+}
+
+\newcommand{\mlsysEqualContribution}{\textsuperscript{*}Equal contribution }
+
+\newcounter{@affilnum}
+\newcommand{\printAffiliationsAndNotice}[1]{%
+\stepcounter{@affiliationcounter}%
+{\let\thefootnote\relax\footnotetext{\hspace*{-\footnotesep}\ifdefined\isaccepted #1\fi%
+\forloop{@affilnum}{1}{\value{@affilnum} < \value{@affiliationcounter}}{
+\textsuperscript{\arabic{@affilnum}}\ifcsname @affilname\the@affilnum\endcsname%
+\csname @affilname\the@affilnum\endcsname%
+\else
+{\bf AUTHORERR: Missing \textbackslash{}mlsysaffiliation.}
+\fi
+}.
+\ifdefined\mlsyscorrespondingauthor@text
+Correspondence to: \mlsyscorrespondingauthor@text.
+\else
+{\bf AUTHORERR: Missing \textbackslash{}mlsyscorrespondingauthor.}
+\fi
+
+\ \\
+\Notice@String
+}
+}
+}
+
+%\makeatother
+
+\long\def\icmladdress#1{% legacy command kept only to print a migration hint; its argument #1 is discarded
+ {\bf The \textbackslash{}icmladdress command is no longer used.  See the example\_paper PDF .tex for usage of \textbackslash{}mlsysauthor and \textbackslash{}mlsysaffiliation.}
+}
+
+%% keywords as first class citizens
+\def\mlsyskeywords#1{%
+%  \ifdefined\isaccepted \else
+%    \par {\bf Keywords:} #1%
+%  \fi
+%  \ifdefined\nohyperref\else\ifdefined\hypersetup
+%    \hypersetup{pdfkeywords={#1}}
+%  \fi\fi
+%  \ifdefined\isaccepted \else
+%    \par {\bf Keywords:} #1%
+%  \fi
+  \ifdefined\nohyperref\else\ifdefined\hypersetup
+    \hypersetup{pdfkeywords={#1}}
+  \fi\fi
+}
+
+% modification to natbib citations
+\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
+
+% Redefinition of the abstract environment. 
+\renewenvironment{abstract}
+   {%
+% Insert the ``appearing in'' copyright notice.
+%\@copyrightspace
+\centerline{\large\bf\scshape Abstract}
+   \vspace{-0.12in}\begin{quote}}
+   {\par\end{quote}\vskip 0.12in}
+
+% numbered section headings with different treatment of numbers
+
+\def\@startsection#1#2#3#4#5#6{\if@noskipsec \leavevmode \fi
+   \par \@tempskipa #4\relax
+   \@afterindenttrue
+% Altered the following line to indent a section's first paragraph. 
+%  \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \@afterindentfalse\fi
+   \ifdim \@tempskipa <\z@ \@tempskipa -\@tempskipa \fi
+   \if@nobreak \everypar{}\else
+     \addpenalty{\@secpenalty}\addvspace{\@tempskipa}\fi \@ifstar
+     {\@ssect{#3}{#4}{#5}{#6}}{\@dblarg{\@sict{#1}{#2}{#3}{#4}{#5}{#6}}}}
+% \@sict{#1 counter}{#2 level}{#3 indent}{#4 beforeskip}{#5 afterskip}{#6 style}[#7 title for mark and toc]{#8 title}: sets the heading for the unstarred form of \@startsection.
+\def\@sict#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
+     \def\@svsec{}\else % level deeper than secnumdepth: no number
+     \refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname}\fi % otherwise step the counter and keep its printed form in \@svsec
+     \@tempskipa #5\relax
+      \ifdim \@tempskipa>\z@ % positive afterskip: the heading is set as a paragraph of its own
+        \begingroup #6\relax
+        %%% GS: changed @svsec from .~ to \quad (NOTE(review): the \quad is emitted even when \@svsec is empty)
+          \@hangfrom{\hskip #3\relax\@svsec\quad}{\interlinepenalty \@M #8\par}
+        \endgroup
+       \csname #1mark\endcsname{#7}\addcontentsline
+         {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+                      \protect\numberline{\csname the#1\endcsname}\fi
+                    #7}\else % non-positive afterskip: the heading, mark and toc entry are stored in \@svsechd instead
+        \def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
+                      {#7}\addcontentsline
+                           {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+                             \protect\numberline{\csname the#1\endcsname}\fi
+                       #7}}\fi
+     \@xsect{#5}} % finish through \@xsect, which receives the afterskip
+% \@sect: same arguments and structure as \@sict above, except that \@svsec carries the number followed by \hskip 0.4em and no \quad is added. NOTE(review): the \@startsection defined in this file dispatches to \@sict, so this is presumably kept for code that calls \@sect directly -- confirm.
+\def\@sect#1#2#3#4#5#6[#7]#8{\ifnum #2>\c@secnumdepth
+     \def\@svsec{}\else % level deeper than secnumdepth: no number
+     \refstepcounter{#1}\edef\@svsec{\csname the#1\endcsname\hskip 0.4em }\fi
+     \@tempskipa #5\relax
+      \ifdim \@tempskipa>\z@ % positive afterskip: the heading is set as a paragraph of its own
+        \begingroup #6\relax
+          \@hangfrom{\hskip #3\relax\@svsec}{\interlinepenalty \@M #8\par}
+        \endgroup
+       \csname #1mark\endcsname{#7}\addcontentsline
+         {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+                      \protect\numberline{\csname the#1\endcsname}\fi
+                    #7}\else % non-positive afterskip: the heading, mark and toc entry are stored in \@svsechd instead
+        \def\@svsechd{#6\hskip #3\@svsec #8\csname #1mark\endcsname
+                      {#7}\addcontentsline
+                           {toc}{#1}{\ifnum #2>\c@secnumdepth \else
+                             \protect\numberline{\csname the#1\endcsname}\fi
+                       #7}}\fi
+     \@xsect{#5}} % finish through \@xsect, which receives the afterskip
+
+% section headings with less space above and below them. Arguments given to \@startsection: {counter name}{level}{indent}{beforeskip}{afterskip}{style}. \paragraph and \subparagraph pass a negative afterskip (-1em), which selects the \@svsechd branch of \@sict.
+\def\thesection {\arabic{section}} % printed as "1"
+\def\thesubsection {\thesection.\arabic{subsection}} % printed as "1.2"
+%%% GS: added \scshape below
+\def\section{\@startsection{section}{1}{\z@}{-0.12in}{0.02in}
+             {\large\bf\scshape\raggedright}}
+\def\subsection{\@startsection{subsection}{2}{\z@}{-0.10in}{0.01in}
+                {\normalsize\bf\raggedright}}
+                %%% GS: removed \sc below, added \itshape
+\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-0.08in}{0.01in}
+                {\normalsize\itshape\raggedright}}
+\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
+  0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
+  0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+ 
+% Footnotes: spacing parameters, a 0.8in separator rule, and a reset of the footnote counter. 
+\footnotesep 6.65pt % 
+\skip\footins 9pt 
+\def\footnoterule{\kern-3pt \hrule width 0.8in \kern 2.6pt } % the kerns (-3pt, +2.6pt) and TeX's default 0.4pt rule height add up to zero
+\setcounter{footnote}{0} 
+ 
+% Lists and paragraphs: no paragraph indent, 6pt between paragraphs, compact list spacing. 
+\parindent 0pt 
+\topsep 4pt plus 1pt minus 2pt 
+\partopsep 1pt plus 0.5pt minus 0.5pt 
+\itemsep 2pt plus 1pt minus 0.5pt 
+\parsep 2pt plus 1pt minus 0.5pt 
+\parskip 6pt
+% Left margins per nesting level. \labelsep is assigned before \labelwidth is derived from it, so the top-level label width uses the 5pt set here and not whatever value the document class left behind.
+\leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em 
+\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em  
+\leftmarginvi .5em 
+\labelsep 5pt \labelwidth\leftmargini\advance\labelwidth-\labelsep 
+ 
+\def\@listi{\leftmargin\leftmargini} % level 1 sets only the left margin
+\def\@listii{\leftmargin\leftmarginii % level 2: margin, label width = margin - \labelsep, tighter vertical spacing
+   \labelwidth\leftmarginii\advance\labelwidth-\labelsep 
+   \topsep 2pt plus 1pt minus 0.5pt 
+   \parsep 1pt plus 0.5pt minus 0.5pt 
+   \itemsep \parsep} 
+\def\@listiii{\leftmargin\leftmarginiii % level 3: as level 2, with \parsep zero and \itemsep equal to \topsep
+    \labelwidth\leftmarginiii\advance\labelwidth-\labelsep 
+    \topsep 1pt plus 0.5pt minus 0.5pt  
+    \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt 
+    \itemsep \topsep} 
+\def\@listiv{\leftmargin\leftmarginiv % levels 4-6 set only the margin and the label width
+     \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} 
+\def\@listv{\leftmargin\leftmarginv 
+     \labelwidth\leftmarginv\advance\labelwidth-\labelsep} 
+\def\@listvi{\leftmargin\leftmarginvi 
+     \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} 
+ 
+\abovedisplayskip 7pt plus2pt minus5pt% glue above a displayed equation
+\belowdisplayskip \abovedisplayskip % the same glue below it
+\abovedisplayshortskip  0pt plus3pt% TeX's "short" variants, used when the display follows a short line
+\belowdisplayshortskip  4pt plus3pt minus3pt% 
+ 
+% Less leading in most fonts (due to the narrow columns); the choices were between 1-pt and 1.5-pt leading. Each line is \@setsize<command>{<baselineskip>}<size macros>; \small and \footnotesize receive identical settings. 
+% NOTE(review): the first definition targets \@normalsize, not \normalsize -- confirm that it still takes effect with the LaTeX version in use. 
+\def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} 
+\def\small{\@setsize\small{10pt}\ixpt\@ixpt} 
+\def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} 
+\def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} 
+\def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} 
+\def\large{\@setsize\large{14pt}\xiipt\@xiipt} 
+\def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} 
+\def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} 
+\def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} 
+\def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} 
+
+% Revised formatting for figure captions and table titles. 
+\newsavebox\newcaptionbox\newdimen\newcaptionboxwid
+% \@makecaption{#1 label}{#2 text}: a caption whose trial setting "#1. #2" fits within \hsize is centered on one line; a wider one is set as a paragraph that starts with the label. The label is small and slanted in both cases.
+\long\def\@makecaption#1#2{
+ \vskip 10pt 
+        \baselineskip 11pt
+        \setbox\@tempboxa\hbox{#1. #2} % trial setting on a single line, in the font current at this point
+        \ifdim \wd\@tempboxa >\hsize % too wide for one line
+        \sbox{\newcaptionbox}{\small\sl #1.~}
+        \newcaptionboxwid=\wd\newcaptionbox % NOTE(review): this width is not read again inside this macro
+        \usebox\newcaptionbox {\footnotesize #2}
+%        \usebox\newcaptionbox {\small #2}
+        \else % fits: label and text centered together
+          \centerline{{\small\sl #1.} {\small #2}} 
+        \fi}
+
+\def\fnum@figure{Figure \thefigure} % label text "Figure <number>"
+\def\fnum@table{Table \thetable} % label text "Table <number>"
+
+% Zero-width rules that reserve room around table text: \abovestrut{<h>} reaches <h> above the baseline, \belowstrut{<d>} reaches <d> below it.
+\def\abovestrut#1{\rule[0in]{0in}{#1}\ignorespaces}
+\def\belowstrut#1{\rule[-#1]{0in}{#1}\ignorespaces}
+% Presets: 0.20in above, 0.10in below, or both at once.
+\def\abovespace{\abovestrut{0.20in}}
+\def\belowspace{\belowstrut{0.10in}}
+\def\aroundspace{\abovespace\belowspace}
+
+% Hand-made itemization: \texitem{<label>} starts a new unindented paragraph with a 12pt hanging indent and puts <label> at the right of a 12pt-wide box; \icmlitem is \texitem with a bullet as the label. 
+\def\texitem#1{\par\noindent\hangindent 12pt
+               \hbox to 12pt {\hss #1 ~}\ignorespaces}
+\def\icmlitem{\texitem{$\bullet$}}
+
+% \comment{<text>}: discards its argument, to comment out multiple lines of text (\long lets the argument contain paragraph breaks).
+\long\def\comment#1{}
+
+
+
+
+%% Line counter (not in final version). Adapted from NIPS style file by Christoph Sawade
+
+% Vertical Ruler
+% This code is, largely, from the CVPR 2010 conference style file
+% ----- define vruler
+\makeatletter
+\newbox\icmlrulerbox % the finished ruler; filled globally by \makevruler
+\newcount\icmlrulercount % number printed on the next ruler line; advanced globally
+\newdimen\icmlruleroffset % vertical offset used when the ruler is put on the page
+\newdimen\cv@lineheight % distance between two ruler lines
+\newdimen\cv@boxheight % total height of the ruler
+\newbox\cv@tmpbox % scratch box holding one ruler line
+\newcount\cv@refno % loop index while the ruler is built
+\newcount\cv@tot % loop bound while the ruler is built
+% \fillzeros[<WIDTH>]<NUMBER>: typesets NUMBER left-padded with zeros to WIDTH characters; a minus sign is printed first and counts towards WIDTH.
+\newcount\cv@tmpc@ \newcount\cv@tmpc
+\def\fillzeros[#1]#2{\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi % \cv@tmpc@ = |NUMBER|
+\cv@tmpc=1 % \cv@tmpc counts the characters of the number, starting with one digit
+\loop\ifnum\cv@tmpc@<10 \else \divide\cv@tmpc@ by 10 \advance\cv@tmpc by 1 \fi % one more digit for every division by 10
+   \ifnum\cv@tmpc@=10\relax\cv@tmpc@=11\relax\fi \ifnum\cv@tmpc@>10 \repeat % an exact 10 is bumped to 11 so that the >10 test loops once more
+\ifnum#2<0\advance\cv@tmpc1\relax-\fi % negative NUMBER: print the sign and count it
+\loop\ifnum\cv@tmpc<#1\relax0\advance\cv@tmpc1\relax\fi \ifnum\cv@tmpc<#1 \repeat % emit zeros until WIDTH is reached
+\cv@tmpc@=#2\relax\ifnum\cv@tmpc@<0\cv@tmpc@=-\cv@tmpc@\fi \relax\the\cv@tmpc@}% finally print |NUMBER|
+% \makevruler[<SCALE>][<INITIAL_COUNT>][<STEP>][<DIGITS>][<HEIGHT>]: builds in \icmlrulerbox a column HEIGHT tall of zero-padded DIGITS-wide numbers, one line every SCALE, starting at INITIAL_COUNT and increasing by STEP.
+\def\makevruler[#1][#2][#3][#4][#5]{
+	\begingroup\offinterlineskip
+		\textheight=#5\vbadness=10000\vfuzz=120ex\overfullrule=0pt% \textheight is changed only inside this group; bad/overfull box reports are silenced
+		\global\setbox\icmlrulerbox=\vbox to \textheight{% the box assignment is global, so it survives \endgroup
+			{
+				\parskip=0pt\hfuzz=150em\cv@boxheight=\textheight
+				\cv@lineheight=#1\global\icmlrulercount=#2% line pitch and (global) first number
+				\cv@tot\cv@boxheight\divide\cv@tot\cv@lineheight\advance\cv@tot2% loop bound: HEIGHT / SCALE (integer division) + 2
+				\cv@refno1\vskip-\cv@lineheight\vskip1ex% loop index starts at 1; initial vertical adjustment
+				\loop\setbox\cv@tmpbox=\hbox to0cm{					 % side margin
+					\hfil {\hfil\fillzeros[#4]\icmlrulercount}
+				}% one ruler line: the number pushed right by \hfil inside a zero-width box
+				\ht\cv@tmpbox\cv@lineheight\dp\cv@tmpbox0pt\box\cv@tmpbox\break % height forced to SCALE, depth to 0pt
+				\advance\cv@refno1\global\advance\icmlrulercount#3\relax % next index, next number
+				\ifnum\cv@refno<\cv@tot\repeat
+			}
+		}
+	\endgroup
+}%
+\makeatother
+% ----- end of vruler
+
+
+% \icmlruler{<INITIAL_COUNT>}: builds a ruler with \makevruler[<SCALE>=12pt][<INITIAL_COUNT>][<STEP>=1][<DIGITS>=3][<HEIGHT>=\textheight] and typesets the resulting box.
+\def\icmlruler#1{\makevruler[12pt][#1][1][3][\textheight]\usebox{\icmlrulerbox}}
+\AddToShipoutPicture{% page overlay: a gray line-number ruler to the left of the text block, unless \isaccepted is defined
+\icmlruleroffset=\textheight
+\advance\icmlruleroffset by 5.2pt % top margin
+  \color[rgb]{.7,.7,.7} % NOTE(review): the color is set even when \isaccepted is defined
+  \ifdefined\isaccepted \else
+	  \AtTextUpperLeft{%
+	    \put(\LenToUnit{-35pt},\LenToUnit{-\icmlruleroffset}){%left ruler
+	      \icmlruler{\icmlrulercount}} % starting from \icmlrulercount itself continues the numbering where the previous ruler stopped
+%	    \put(\LenToUnit{1.04\textwidth},\LenToUnit{-\icmlruleroffset}){%right ruler
+%	      \icmlruler{\icmlrulercount}}
+	  }
+	 \fi
+}
+\endinput
-- 
GitLab