Commit a4535f8b authored by Jeffrey Wigger

All changes since departing from Main; contains the random walk TCP implementation

parent 77a1296e
Showing 1143 additions and 0 deletions
16
0 1
0 15
1 0
1 2
2 1
2 3
3 2
3 4
4 3
4 5
5 4
5 6
6 5
6 7
7 8
7 6
8 9
8 7
9 8
9 10
10 9
10 11
11 10
11 12
12 11
12 13
13 12
13 14
14 13
14 15
15 0
15 14
96
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
0 72
0 73
0 74
0 75
0 76
0 77
0 78
0 79
0 80
0 81
0 82
0 83
0 84
0 85
0 86
0 87
0 88
0 89
0 90
0 91
0 92
0 93
0 94
0 95
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 0
91 0
92 0
93 0
94 0
95 0
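The two edge lists above use the framework's plain-text graph format: the first line gives the node count, and each following line is a directed edge "source destination", with undirected links listed in both directions. The first list is a 16-node ring; the second is a 96-node star centred on node 0. Below is a minimal, hypothetical generator for such files; the helper names and output file names are illustrative, not part of this commit.

#!/usr/bin/env python3
# Hypothetical generator for the .edges format shown above:
# first line = node count, then one directed edge "src dst" per line.

def write_ring(path, n):
    # A cycle: every node i links to its two neighbours, both directions listed.
    with open(path, "w") as f:
        f.write(f"{n}\n")
        for i in range(n):
            f.write(f"{i} {(i - 1) % n}\n")
            f.write(f"{i} {(i + 1) % n}\n")

def write_star(path, n, centre=0):
    # A star: the centre links to every other node, then every node links back.
    with open(path, "w") as f:
        f.write(f"{n}\n")
        for i in range(n):
            if i != centre:
                f.write(f"{centre} {i}\n")
        for i in range(n):
            if i != centre:
                f.write(f"{i} {centre}\n")

if __name__ == "__main__":
    write_ring("16_ring.edges", 16)  # reproduces the first list (up to edge order)
    write_star("96_star.edges", 96)  # reproduces the second list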
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
# For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
# on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=1
machines=6
global_epochs=25
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_celeba_synchronous.ini")
# Learning rates to test
lrs=("0.001" "0.0001" "0.00001")
# Batch sizes to test
batchsize=("8")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("2000")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
dataset_size=63741
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
echo batchsize: $b
for r in "${comm_rounds_per_global_epoch[@]}"
do
echo communication rounds per global epoch: $r
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $b))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in 25 global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for lr in "${lrs[@]}"
do
for i in "${tests[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m
echo results are stored in: $log_dir
mkdir -p $log_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 1
echo end of sleep
done
done
done
done
#
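To make the iteration arithmetic in the script above concrete, the following standalone sketch reproduces the three embedded Python one-liners with this script's default values (6 processes, Celeba's 63741 samples, batch size 8, 2000 communication rounds per global epoch, 25 global epochs). The resulting numbers follow from the formulas; they are not separately recorded in the commit.

from math import floor

# Values taken from the Celeba script above.
procs = 1 * 6                           # procs_per_machine * machines
samples_per_user = 63741 // procs       # 10623
b, r, global_epochs = 8, 2000, 25

batches_per_epoch = samples_per_user // b    # 1327

# r >= batches_per_epoch here, so an epoch is limited by the data, not by r.
iterations = (batches_per_epoch * global_epochs
              if r >= batches_per_epoch
              else global_epochs * r)        # 33175

# Each communication round uses at least one batch.
tmp = floor(batches_per_epoch / r)           # 0
batches_per_comm_round = 1 if tmp == 0 else tmp   # 1

# Correct the iteration count for the rounding, never going below the original.
y = floor(((batches_per_epoch / r) / batches_per_comm_round) * iterations)  # 22011
new_iterations = iterations if y < iterations else y                        # 33175

test_after = new_iterations // global_epochs  # 1327, i.e. test once per global epoch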
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
# For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
# on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=1
machines=6
global_epochs=25
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_cifar_syn.ini")
# Learning rates to test
lrs=("0.1" "0.01" "0.001" "0.0001")
# Batch sizes to test
batchsize=("20")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("416")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Cifar has 50000 samples
dataset_size=50000
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
echo batchsize: $b
for r in "${comm_rounds_per_global_epoch[@]}"
do
echo communication rounds per global epoch: $r
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $b))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in 25 global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for lr in "${lrs[@]}"
do
for i in "${tests[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m
echo results are stored in: $log_dir
mkdir -p $log_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 1
echo end of sleep
done
done
done
done
#
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use SGD.
# - The script will set '--test_after' and '--train_evaluate_after' such that evaluation happens at the end of each global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=96_regular.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=80
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_shakespeare_sharing.ini")
# Learning rates to test
lrs=("0.5" "0.1")
# Batch sizes to test
batchsize=("16")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("5")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
dataset_size=97545
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
echo batchsize: $b
for r in "${comm_rounds_per_global_epoch[@]}"
do
echo communication rounds per global epoch: $r
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $b))
echo batches per global epoch: $batches_per_epoch
# the number of iterations over all global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for lr in "${lrs[@]}"
do
for i in "${tests[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir_base=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')
echo results are stored in: $log_dir_base
log_dir=$log_dir_base/machine$m
mkdir -p $log_dir
weight_store_dir=$log_dir_base/weights
mkdir -p $weight_store_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -wsd $weight_store_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 500
echo end of sleep
done
done
done
done
#
#!/bin/bash
# Documentation
# Note: documentation was not written for this run file, so actual behaviour may differ
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
# For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
# on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=/tmp/logs/
python_bin=/home/jeffrey/anaconda3/envs/sacs39/bin
logs_subfolder=reddit_local/
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=6
machines=1
global_epochs=3
eval_file=testing.py
log_level=DEBUG
ip_machines=ip_addr_1Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
# Base configs for which the grid search is done
tests=("step_configs/config_reddit_synchronous_local.ini")
# Learning rates
lr="1"
# Batch size
batchsize="16"
# The number of communication rounds per global epoch
comm_rounds_per_global_epoch="30"
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
# Reddit has 70642
# Femnist has 734463
# Shakespeare has 3678451
dataset_size=70642
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
random_seeds=("97")
# random_seed = 97
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $batchsize))
echo batches per global epoch: $batches_per_epoch
# the number of iterations over all global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for i in "${tests[@]}"
do
for seed in "${random_seeds[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')/machine$m
echo results are stored in: $log_dir
mkdir -p $log_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize
$python_bin/crudini --set $config_file DATASET random_seed $seed
$env_python -q -X faulthandler $eval_file -ro 0 -tea $test_after -ld $log_dir -mid 0 -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 10
echo end of sleep
done
done
#
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRW
sharing_class = DPSGDRW
rw_chance = 0.0
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi_check_history
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRWAsync
sharing_class = DPSGDRWAsync
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRW
sharing_class = SharingWithRW
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi_check_history
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRWAsync
sharing_class = SharingWithRWAsync
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalkRouting
comm_class = TCPRandomWalkRouting
addresses_filepath = ip_addr_6Machines.json
sampler = equi
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRWAsyncDynamic
sharing_class = SharingWithRWAsyncDynamic
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.LowerBoundTopK
sharing_class = LowerBoundTopK
lower_bound = 0.0
alpha = 0.1
metro_hastings = False
accumulation = True
accumulate_averaging_changes = True
[DATASET]
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = /mnt/nfs/shared/CIFAR
test_dir = /mnt/nfs/shared/CIFAR
; python list of fractions below
sizes =
random_seed = 99
partition_niid = True
shards = 4
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 65
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Synchronous
sharing_class = Synchronous
[DATASET]
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = /mnt/nfs/shared/CIFAR
test_dir = /mnt/nfs/shared/CIFAR
; python list of fractions below
sizes =
random_seed = 99
partition_niid = True
shards = 4
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 65
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.LowerBoundTopK
sharing_class = LowerBoundTopK
lower_bound = 0.0
alpha = 0.5
metro_hastings = False
accumulation = True
accumulate_averaging_changes = True
[DATASET]
dataset_package = decentralizepy.datasets.Femnist
dataset_class = Femnist
random_seed = 97
model_class = CNN
train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
# There are 734463 femnist samples
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 47
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.LowerBoundTopK
sharing_class = LowerBoundTopK
lower_bound = 0.0
alpha = 0.1
metro_hastings = False
accumulation = True
accumulate_averaging_changes = True
[DATASET]
dataset_package = decentralizepy.datasets.MovieLens
dataset_class = MovieLens
model_class = MatrixFactorization
train_dir = /mnt/nfs/shared/leaf/data/movielens
test_dir = /mnt/nfs/shared/leaf/data/movielens
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.1
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 91
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = MSELoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Sharing
sharing_class = Sharing
[DATASET]
dataset_package = decentralizepy.datasets.Reddit
dataset_class = Reddit
random_seed = 97
model_class = RNN
train_dir = /home/jeffrey/Downloads/reddit/per_user_data/train
test_dir = /home/jeffrey/Downloads/reddit/new_small_data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 47
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalkRouting
comm_class = TCPRandomWalkRouting
addresses_filepath = ip_addr_6Machines.json
sampler = equi
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRWAsyncDynamic
sharing_class = SharingWithRWAsyncDynamic
[DATASET]
dataset_package = decentralizepy.datasets.Reddit
dataset_class = Reddit
random_seed = 97
model_class = RNN
train_dir = /home/jeffrey/Downloads/reddit/per_user_data/train
test_dir = /home/jeffrey/Downloads/reddit/new_small_data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.SubSampling
sharing_class = SubSampling
alpha = 0.1
[DATASET]
dataset_package = decentralizepy.datasets.Reddit
dataset_class = Reddit
random_seed = 97
model_class = RNN
train_dir = /home/jeffrey/Downloads/reddit/per_user_data/train
test_dir = /home/jeffrey/Downloads/reddit/new_small_data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 47
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Synchronous
sharing_class = Synchronous
import importlib
import json
import logging
import lzma
import pickle
from collections import deque
@@ -50,6 +51,7 @@ class TCP(Communication):
offset=20000,
compression_package=None,
compression_class=None,
log_dir=None,
):
"""
Constructor
......
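The hunk above threads a new log_dir argument through the TCP constructor. As a purely hypothetical call-site sketch: only the keyword arguments visible in the hunk are taken from the commit; every other argument name and value below is an assumption for illustration.

# Hypothetical sketch: offset, compression_package, compression_class and
# log_dir are visible in the hunk above; the remaining arguments are assumed.
comm = TCP(
    rank,
    machine_id,
    mapping,
    total_procs,
    addresses_filepath="ip_addr_6Machines.json",
    offset=20000,
    compression_package=None,
    compression_class=None,
    log_dir="/tmp/logs/tcp",  # new parameter added by this commit
)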