Commit a4535f8b authored by Jeffrey Wigger

All changes since departing from Main; contains the random walk TCP implementation

parent 77a1296e
Showing 1143 additions and 0 deletions
16
0 1
0 15
1 0
1 2
2 1
2 3
3 2
3 4
4 3
4 5
5 4
5 6
6 5
6 7
7 8
7 6
8 9
8 7
9 8
9 10
10 9
10 11
11 10
11 12
12 11
12 13
13 12
13 14
14 13
14 15
15 0
15 14
96
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
0 72
0 73
0 74
0 75
0 76
0 77
0 78
0 79
0 80
0 81
0 82
0 83
0 84
0 85
0 86
0 87
0 88
0 89
0 90
0 91
0 92
0 93
0 94
0 95
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 0
14 0
15 0
16 0
17 0
18 0
19 0
20 0
21 0
22 0
23 0
24 0
25 0
26 0
27 0
28 0
29 0
30 0
31 0
32 0
33 0
34 0
35 0
36 0
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0
48 0
49 0
50 0
51 0
52 0
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 0
78 0
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 0
91 0
92 0
93 0
94 0
95 0
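The two edge lists above use the framework's plain-text graph format: the first line gives the node count, and each following line is a directed edge "source destination", with undirected links listed in both directions. The first list is a 16-node ring; the second is a 96-node star centred on node 0. Below is a minimal, hypothetical generator for such files; the helper names and output file names are illustrative, not part of this commit.

#!/usr/bin/env python3
# Hypothetical generator for the .edges format shown above:
# first line = node count, then one directed edge "src dst" per line.

def write_ring(path, n):
    # A cycle: every node i links to its two neighbours, both directions listed.
    with open(path, "w") as f:
        f.write(f"{n}\n")
        for i in range(n):
            f.write(f"{i} {(i - 1) % n}\n")
            f.write(f"{i} {(i + 1) % n}\n")

def write_star(path, n, centre=0):
    # A star: the centre links to every other node, then every node links back.
    with open(path, "w") as f:
        f.write(f"{n}\n")
        for i in range(n):
            if i != centre:
                f.write(f"{centre} {i}\n")
        for i in range(n):
            if i != centre:
                f.write(f"{i} {centre}\n")

if __name__ == "__main__":
    write_ring("16_ring.edges", 16)  # reproduces the first list (up to edge order)
    write_star("96_star.edges", 96)  # reproduces the second list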
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
# For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
# on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=1
machines=6
global_epochs=25
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_celeba_synchronous.ini")
# Learning rates to test
lrs=("0.001" "0.0001" "0.00001")
# Batch sizes to test
batchsize=("8")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("2000")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
dataset_size=63741
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
echo batchsize: $b
for r in "${comm_rounds_per_global_epoch[@]}"
do
echo communication rounds per global epoch: $r
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $b))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in 25 global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for lr in "${lrs[@]}"
do
for i in "${tests[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m
echo results are stored in: $log_dir
mkdir -p $log_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 1
echo end of sleep
done
done
done
done
#
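To make the iteration arithmetic in the script above concrete, the following standalone sketch reproduces the three embedded Python one-liners with this script's default values (6 processes, Celeba's 63741 samples, batch size 8, 2000 communication rounds per global epoch, 25 global epochs). The resulting numbers follow from the formulas; they are not separately recorded in the commit.

from math import floor

# Values taken from the Celeba script above.
procs = 1 * 6                           # procs_per_machine * machines
samples_per_user = 63741 // procs       # 10623
b, r, global_epochs = 8, 2000, 25

batches_per_epoch = samples_per_user // b    # 1327

# r >= batches_per_epoch here, so an epoch is limited by the data, not by r.
iterations = (batches_per_epoch * global_epochs
              if r >= batches_per_epoch
              else global_epochs * r)        # 33175

# Each communication round uses at least one batch.
tmp = floor(batches_per_epoch / r)           # 0
batches_per_comm_round = 1 if tmp == 0 else tmp   # 1

# Correct the iteration count for the rounding, never going below the original.
y = floor(((batches_per_epoch / r) / batches_per_comm_round) * iterations)  # 22011
new_iterations = iterations if y < iterations else y                        # 33175

test_after = new_iterations // global_epochs  # 1327, i.e. test once per global epoch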
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
# For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
# on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=1
machines=6
global_epochs=25
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_cifar_syn.ini")
# Learning rates to test
lrs=("0.1" "0.01" "0.001" "0.0001")
# Batch sizes to test
batchsize=("20")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("416")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Cifar has 50000 samples
dataset_size=50000
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
echo batchsize: $b
for r in "${comm_rounds_per_global_epoch[@]}"
do
echo communication rounds per global epoch: $r
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $b))
echo batches per global epoch: $batches_per_epoch
# the number of iterations in 25 global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for lr in "${lrs[@]}"
do
for i in "${tests[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')/machine$m
echo results are stored in: $log_dir
mkdir -p $log_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 1
echo end of sleep
done
done
done
done
#
#!/bin/bash
# Documentation
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use SGD.
# - The script will set '--test_after' and '--train_evaluate_after' such that evaluation happens at the end of each global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=$1
python_bin=$2
logs_subfolder=$3
decpy_path=$nfs_home/decentralizepy/eval
cd $decpy_path
env_python=$python_bin/python3
graph=96_regular.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=80
eval_file=testing.py
log_level=INFO
ip_machines=$nfs_home/configs/ip_addr_6Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_shakespeare_sharing.ini")
# Learning rates to test
lrs=("0.5" "0.1")
# Batch sizes to test
batchsize=("16")
# The number of communication rounds per global epoch to test
comm_rounds_per_global_epoch=("5")
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
dataset_size=97545
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
for b in "${batchsize[@]}"
do
echo batchsize: $b
for r in "${comm_rounds_per_global_epoch[@]}"
do
echo communication rounds per global epoch: $r
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $b))
echo batches per global epoch: $batches_per_epoch
# the number of iterations over all global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $r >= $batches_per_epoch else print($global_epochs * $r)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $r); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $r); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $r)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for lr in "${lrs[@]}"
do
for i in "${tests[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir_base=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$r:b=$b:$(date '+%Y-%m-%dT%H:%M')
echo results are stored in: $log_dir_base
log_dir=$log_dir_base/machine$m
mkdir -p $log_dir
weight_store_dir=$log_dir_base/weights
mkdir -p $weight_store_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $b
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -wsd $weight_store_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 500
echo end of sleep
done
done
done
done
#
#!/bin/bash
# Documentation
# Note: documentation was not written for this run file, so actual behaviour may differ
# This bash file takes three inputs. The first argument (nfs_home) is the path to the nfs home directory.
# The second one (python_bin) is the path to the python bin folder.
# The last argument (logs_subfolder) is the path to the logs folder with respect to the nfs home directory.
#
# The nfs home directory should contain the code of this framework stored in $nfs_home/decentralizepy and a folder
# called configs which contains the file 'ip_addr_6Machines.json'
# The python bin folder needs to include all the dependencies of this project including crudini.
# The results will be stored in $nfs_home/$logs_subfolder
# Each of the experiments will be stored in its own folder inside the logs_subfolder. The folder of the experiment
# starts with the last part of the config name, i.e., for 'config_celeba_topkacc.ini' it will start with topkacc.
# The name further includes the learning rate, rounds and batchsize as well as the exact date at which the experiment
# was run.
# Example: ./run_grid.sh /mnt/nfs/wigger /mnt/nfs/wigger/anaconda3/envs/sacs39/bin /logs/celeba
#
# Additional requirements:
# Each node needs a folder called 'tmp' in the user's home directory
#
# Note:
# - The script does not change the optimizer. All configs are written to use Adam.
# For SGD these need to be changed manually.
# - The script will set '--test_after' and '--train_evaluate_after' to comm_rounds_per_global_epoch, i.e., the evaluation
# on the train set and on the test set is carried out every global epoch.
# - The '--reset_optimizer' option is set to 0, i.e., the optimizer is not reset after a communication round (only
# relevant for Adam and other optimizers with internal state)
#
# Adapting the script to other datasets:
# Change the variable 'dataset_size' to reflect the dataset's size.
#
# Known issues:
# - If the script is started at the very end of a minute, there is a chance that two folders are created, as not all
# machines may start running the script at the exact same moment.
nfs_home=/tmp/logs/
python_bin=/home/jeffrey/anaconda3/envs/sacs39/bin
logs_subfolder=reddit_local/
env_python=$python_bin/python3
graph=6_star.edges
config_file=~/tmp/config.ini
procs_per_machine=6
machines=1
global_epochs=3
eval_file=testing.py
log_level=DEBUG
ip_machines=ip_addr_1Machines.json
m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2`
# Base configs for which the grid search is done
tests=("step_configs/config_reddit_synchronous_local.ini")
# Learning rates
lr="1"
# Batch size
batchsize="16"
# The number of communication rounds per global epoch
comm_rounds_per_global_epoch="30"
procs=`expr $procs_per_machine \* $machines`
echo procs: $procs
# Celeba has 63741 samples
# Reddit has 70642
# Femnist has 734463
# Shakespeare has 3678451
dataset_size=70642
# Calculating the number of samples that each user/proc will have on average
samples_per_user=`expr $dataset_size / $procs`
echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
random_seeds=("97")
# random_seed = 97
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
batches_per_epoch=$(($samples_per_user / $batchsize))
echo batches per global epoch: $batches_per_epoch
# the number of iterations over all global epochs
iterations=$($env_python -c "from math import floor; print($batches_per_epoch * $global_epochs) if $comm_rounds_per_global_epoch >= $batches_per_epoch else print($global_epochs * $comm_rounds_per_global_epoch)")
echo iterations: $iterations
# calculating the number of batches each user/proc uses per communication step (The actual number may be a float, which we round down)
batches_per_comm_round=$($env_python -c "from math import floor; x = floor($batches_per_epoch / $comm_rounds_per_global_epoch); print(1 if x==0 else x)")
# since the batches per communication round were rounded down we need to change the number of iterations to reflect that
new_iterations=$($env_python -c "from math import floor; tmp = floor($batches_per_epoch / $comm_rounds_per_global_epoch); x = 1 if tmp == 0 else tmp; y = floor((($batches_per_epoch / $comm_rounds_per_global_epoch)/x)*$iterations); print($iterations if y<$iterations else y)")
echo batches per communication round: $batches_per_comm_round
echo corrected iterations: $new_iterations
test_after=$(($new_iterations / $global_epochs))
echo test after: $test_after
for i in "${tests[@]}"
do
for seed in "${random_seeds[@]}"
do
echo $i
IFS='_' read -ra NAMES <<< $i
IFS='.' read -ra NAME <<< ${NAMES[-1]}
log_dir=$nfs_home$logs_subfolder/${NAME[0]}:lr=$lr:r=$comm_rounds_per_global_epoch:b=$batchsize:$(date '+%Y-%m-%dT%H:%M')/machine$m
echo results are stored in: $log_dir
mkdir -p $log_dir
cp $i $config_file
# changing the config files to reflect the values of the current grid search state
$python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines
$python_bin/crudini --set $config_file OPTIMIZER_PARAMS lr $lr
$python_bin/crudini --set $config_file TRAIN_PARAMS rounds $batches_per_comm_round
$python_bin/crudini --set $config_file TRAIN_PARAMS batch_size $batchsize
$python_bin/crudini --set $config_file DATASET random_seed $seed
$env_python -q -X faulthandler $eval_file -ro 0 -tea $test_after -ld $log_dir -mid 0 -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 10
echo end of sleep
done
done
#
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRW
sharing_class = DPSGDRW
rw_chance = 0.0
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi_check_history
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRWAsync
sharing_class = DPSGDRWAsync
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRW
sharing_class = SharingWithRW
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi_check_history
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRWAsync
sharing_class = SharingWithRWAsync
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalkRouting
comm_class = TCPRandomWalkRouting
addresses_filepath = ip_addr_6Machines.json
sampler = equi
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRWAsyncDynamic
sharing_class = SharingWithRWAsyncDynamic
[DATASET]
dataset_package = decentralizepy.datasets.Celeba
dataset_class = Celeba
model_class = CNN
images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba
train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.LowerBoundTopK
sharing_class = LowerBoundTopK
lower_bound = 0.0
alpha = 0.1
metro_hastings = False
accumulation = True
accumulate_averaging_changes = True
[DATASET]
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = /mnt/nfs/shared/CIFAR
test_dir = /mnt/nfs/shared/CIFAR
; python list of fractions below
sizes =
random_seed = 99
partition_niid = True
shards = 4
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 65
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Synchronous
sharing_class = Synchronous
[DATASET]
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = /mnt/nfs/shared/CIFAR
test_dir = /mnt/nfs/shared/CIFAR
; python list of fractions below
sizes =
random_seed = 99
partition_niid = True
shards = 4
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 65
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.LowerBoundTopK
sharing_class = LowerBoundTopK
lower_bound = 0.0
alpha = 0.5
metro_hastings = False
accumulation = True
accumulate_averaging_changes = True
[DATASET]
dataset_package = decentralizepy.datasets.Femnist
dataset_class = Femnist
random_seed = 97
model_class = CNN
train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train
test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
# There are 734463 femnist samples
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 47
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.LowerBoundTopK
sharing_class = LowerBoundTopK
lower_bound = 0.0
alpha = 0.1
metro_hastings = False
accumulation = True
accumulate_averaging_changes = True
[DATASET]
dataset_package = decentralizepy.datasets.MovieLens
dataset_class = MovieLens
model_class = MatrixFactorization
train_dir = /mnt/nfs/shared/leaf/data/movielens
test_dir = /mnt/nfs/shared/leaf/data/movielens
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.1
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 91
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = MSELoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Sharing
sharing_class = Sharing
[DATASET]
dataset_package = decentralizepy.datasets.Reddit
dataset_class = Reddit
random_seed = 97
model_class = RNN
train_dir = /home/jeffrey/Downloads/reddit/per_user_data/train
test_dir = /home/jeffrey/Downloads/reddit/new_small_data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 47
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalkRouting
comm_class = TCPRandomWalkRouting
addresses_filepath = ip_addr_6Machines.json
sampler = equi
[SHARING]
sharing_package = decentralizepy.sharing.SharingWithRWAsyncDynamic
sharing_class = SharingWithRWAsyncDynamic
[DATASET]
dataset_package = decentralizepy.datasets.Reddit
dataset_class = Reddit
random_seed = 97
model_class = RNN
train_dir = /home/jeffrey/Downloads/reddit/per_user_data/train
test_dir = /home/jeffrey/Downloads/reddit/new_small_data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 4
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.SubSampling
sharing_class = SubSampling
alpha = 0.1
[DATASET]
dataset_package = decentralizepy.datasets.Reddit
dataset_class = Reddit
random_seed = 97
model_class = RNN
train_dir = /home/jeffrey/Downloads/reddit/per_user_data/train
test_dir = /home/jeffrey/Downloads/reddit/new_small_data/test
; python list of fractions below
sizes =
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 47
full_epochs = False
batch_size = 16
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
[COMMUNICATION]
comm_package = decentralizepy.communication.TCP
comm_class = TCP
addresses_filepath = ip_addr_6Machines.json
[SHARING]
sharing_package = decentralizepy.sharing.Synchronous
sharing_class = Synchronous
import importlib
import json
import logging
import lzma
import pickle
from collections import deque
@@ -50,6 +51,7 @@ class TCP(Communication):
offset=20000,
compression_package=None,
compression_class=None,
log_dir=None,
):
"""
Constructor
......
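The hunk above threads a new log_dir argument through the TCP constructor. As a purely hypothetical call-site sketch: only the keyword arguments visible in the hunk are taken from the commit; every other argument name and value below is an assumption for illustration.

# Hypothetical sketch: offset, compression_package, compression_class and
# log_dir are visible in the hunk above; the remaining arguments are assumed.
comm = TCP(
    rank,
    machine_id,
    mapping,
    total_procs,
    addresses_filepath="ip_addr_6Machines.json",
    offset=20000,
    compression_package=None,
    compression_class=None,
    log_dir="/tmp/logs/tcp",  # new parameter added by this commit
)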