Skip to content
Snippets Groups Projects
Commit 4d5daa73 authored by Jeffrey Wigger
Browse files

cifar extreme

parent 5cc66a3e
No related branches found
No related tags found
No related merge requests found
......@@ -42,7 +42,7 @@ graph=96_regular.edges
config_file=~/tmp/config.ini
procs_per_machine=16
machines=6
global_epochs=800
global_epochs=1000
eval_file=testing.py
log_level=INFO
......@@ -52,7 +52,7 @@ m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print
export PYTHONFAULTHANDLER=1
# Base configs for which the grid search is done
tests=("step_configs/config_cifar_sharing.ini") #"step_configs/config_cifar_partialmodel.ini" "step_configs/config_cifar_topkacc.ini" "step_configs/config_cifar_subsampling.ini" "step_configs/config_cifar_wavelet.ini")
tests=("step_configs/config_cifar_sharing.ini" "step_configs/config_cifar_dpsgdWithRWAsync1.ini" "step_configs/config_cifar_dpsgdWithRWAsync2.ini" "step_configs/config_cifar_dpsgdWithRWAsync4.ini") #"step_configs/config_cifar_partialmodel.ini" "step_configs/config_cifar_topkacc.ini" "step_configs/config_cifar_subsampling.ini" "step_configs/config_cifar_wavelet.ini")
# Learning rates
lr="0.01"
# Batch size
......@@ -68,7 +68,7 @@ echo samples per user: $samples_per_user
# random_seeds for which to rerun the experiments
# random_seeds=("90" "91" "92" "93" "94")
random_seeds=("97")
random_seeds=("90" "91" "92")
echo batchsize: $batchsize
echo communication rounds per global epoch: $comm_rounds_per_global_epoch
# calculating how many batches there are in a global epoch for each user/proc
......@@ -107,7 +107,7 @@ do
$python_bin/crudini --set $config_file DATASET random_seed $seed
$env_python $eval_file -ro 0 -tea $test_after -ld $log_dir -wsd $weight_store_dir -mid $m -ps $procs_per_machine -ms $machines -is $new_iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level
echo $i is done
sleep 200
sleep 300
echo end of sleep
done
done
......
; decentralizepy run configuration: CIFAR10 dataset with the LeNet model,
; DPSGDRWAsync sharing over the TCPRandomWalk communication backend,
; rw_chance = 0.25.
; Full-line ';' comments only; one 'key = value' pair per line.

; Dataset, model and data partitioning.
[DATASET]
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = /mnt/nfs/shared/CIFAR
test_dir = /mnt/nfs/shared/CIFAR
; python list of fractions below (left empty here)
sizes =
; NOTE(review): the grid-search script sets DATASET/random_seed with crudini
; on its working copy of the config, so this looks like a fallback - confirm.
random_seed = 99
partition_niid = True
shards = 1

; Optimizer class and its constructor parameters.
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001

; Local training loop settings.
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 65
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss

; Communication backend and peer address file.
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi_check_history

; Model sharing strategy.
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRWAsync
sharing_class = DPSGDRWAsync
; NOTE(review): presumably the probability of a random-walk step used by
; DPSGDRWAsync - confirm against the sharing class.
rw_chance = 0.25
; decentralizepy run configuration: CIFAR10 dataset with the LeNet model,
; DPSGDRWAsync sharing over the TCPRandomWalk communication backend,
; rw_chance = 0.5.
; Full-line ';' comments only; one 'key = value' pair per line.

; Dataset, model and data partitioning.
[DATASET]
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = /mnt/nfs/shared/CIFAR
test_dir = /mnt/nfs/shared/CIFAR
; python list of fractions below (left empty here)
sizes =
; NOTE(review): the grid-search script sets DATASET/random_seed with crudini
; on its working copy of the config, so this looks like a fallback - confirm.
random_seed = 99
partition_niid = True
shards = 1

; Optimizer class and its constructor parameters.
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.001

; Local training loop settings.
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 65
full_epochs = False
batch_size = 8
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss

; Communication backend and peer address file.
[COMMUNICATION]
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi_check_history

; Model sharing strategy.
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRWAsync
sharing_class = DPSGDRWAsync
; NOTE(review): presumably the probability of a random-walk step used by
; DPSGDRWAsync - confirm against the sharing class.
rw_chance = 0.5
......@@ -29,8 +29,9 @@ loss_class = CrossEntropyLoss
comm_package = decentralizepy.communication.TCPRandomWalk
comm_class = TCPRandomWalk
addresses_filepath = ip_addr_6Machines.json
sampler = equi
sampler = equi_check_history
[SHARING]
sharing_package = decentralizepy.sharing.DPSGDRWAsync
sharing_class = DPSGDRWAsync
rw_chance=1
......@@ -499,6 +499,8 @@ class Node:
change = 5
if global_epoch == 119:
change = 10
if global_epoch == 499:
change = 20
global_epoch += change
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment