From cdc057cf1c93af9f421384b5de0edf3002f0563c Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger <jeffrey.wigger@epfl.ch> Date: Thu, 16 Jun 2022 06:01:05 +0200 Subject: [PATCH] last experiment --- eval/run_xtimes_cifar.sh | 2 +- .../config_cifar_dpsgdWithRWAsync4Jwins.ini | 47 +++++++++++++++++++ .../config_cifar_dpsgdWithRWAsync4Jwins30.ini | 47 +++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins.ini create mode 100644 eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins30.ini diff --git a/eval/run_xtimes_cifar.sh b/eval/run_xtimes_cifar.sh index 616572b..1d46679 100755 --- a/eval/run_xtimes_cifar.sh +++ b/eval/run_xtimes_cifar.sh @@ -52,7 +52,7 @@ m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print export PYTHONFAULTHANDLER=1 # Base configs for which the gird search is done -tests=("step_configs/config_cifar_sharing_dynamicGraphJwins30.ini") # ("step_configs/config_cifar_sharing.ini" "step_configs/config_cifar_dpsgdWithRWAsync1.ini" "step_configs/config_cifar_dpsgdWithRWAsync2.ini" "step_configs/config_cifar_dpsgdWithRWAsync4.ini") #"step_configs/config_cifar_partialmodel.ini" "step_configs/config_cifar_topkacc.ini" "step_configs/config_cifar_subsampling.ini" "step_configs/config_cifar_wavelet.ini") +tests=("step_configs/config_cifar_dpsgdWithRWAsync4Jwins.ini" "step_configs/config_cifar_dpsgdWithRWAsync4Jwins30.ini") # ("step_configs/config_cifar_sharing.ini" "step_configs/config_cifar_dpsgdWithRWAsync1.ini" "step_configs/config_cifar_dpsgdWithRWAsync2.ini" "step_configs/config_cifar_dpsgdWithRWAsync4.ini") #"step_configs/config_cifar_partialmodel.ini" "step_configs/config_cifar_topkacc.ini" "step_configs/config_cifar_subsampling.ini" "step_configs/config_cifar_wavelet.ini") # Learning rates lr="0.01" # Batch size diff --git a/eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins.ini b/eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins.ini new file mode 100644 index 0000000..c703433 --- /dev/null +++ b/eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins.ini @@ -0,0 +1,47 @@ +[DATASET] +dataset_package = decentralizepy.datasets.CIFAR10 +dataset_class = CIFAR10 +model_class = LeNet +train_dir = /mnt/nfs/shared/CIFAR +test_dir = /mnt/nfs/shared/CIFAR +; python list of fractions below +sizes = +random_seed = 99 +partition_niid = True +shards = 1 + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = SGD +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 65 +full_epochs = False +batch_size = 8 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCPRandomWalk +comm_class = TCPRandomWalk +addresses_filepath = ip_addr_6Machines.json +compression_package = decentralizepy.compression.Eliaszfplossy1 +compression_class = Eliaszfplossy1 +compress = True +sampler = equi_check_history + +[SHARING] +sharing_package = decentralizepy.sharing.JwinsDPSGDAsync +sharing_class = JwinsDPSGDAsync +alpha=0.0833 +lower_bound=0.2 +metro_hastings=False +change_based_selection = True +wavelet=sym2 +level= None +accumulation = True +accumulate_averaging_changes = True diff --git a/eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins30.ini b/eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins30.ini new file mode 100644 index 0000000..6dcdd1b --- /dev/null +++ b/eval/step_configs/config_cifar_dpsgdWithRWAsync4Jwins30.ini @@ -0,0 +1,47 @@ +[DATASET] +dataset_package = decentralizepy.datasets.CIFAR10 +dataset_class = CIFAR10 +model_class = LeNet +train_dir = /mnt/nfs/shared/CIFAR +test_dir = /mnt/nfs/shared/CIFAR +; python list of fractions below +sizes = +random_seed = 99 +partition_niid = True +shards = 1 + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = SGD +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 65 +full_epochs = False +batch_size = 8 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCPRandomWalk +comm_class = TCPRandomWalk +addresses_filepath = ip_addr_6Machines.json +compression_package = decentralizepy.compression.Eliaszfplossy1 +compression_class = Eliaszfplossy1 +compress = True +sampler = equi_check_history + +[SHARING] +sharing_package = decentralizepy.sharing.JwinsDPSGDAsync +sharing_class = JwinsDPSGDAsync +alpha=0.25 +lower_bound=0.2 +metro_hastings=False +change_based_selection = True +wavelet=sym2 +level= None +accumulation = True +accumulate_averaging_changes = True -- GitLab