From 76f1fbb08d290a13893778491b25f2185bd1553d Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Wed, 9 Mar 2022 12:20:31 +0100 Subject: [PATCH 01/16] integrating new sharing methods --- eval/96_regular.edges | 381 ++++++++++++++++++ eval/plot.py | 6 + eval/run.sh | 22 +- eval/step_configs/config_femnist_fft.ini | 37 ++ eval/step_configs/config_femnist_sharing.ini | 32 ++ .../config_femnist_subsampling.ini | 34 ++ eval/step_configs/config_femnist_topk.ini | 36 ++ .../step_configs/config_femnist_topkparam.ini | 34 ++ eval/step_configs/config_femnist_wavelet.ini | 41 ++ eval/testing.py | 3 +- setup.cfg | 1 + src/decentralizepy/models/Model.py | 3 + src/decentralizepy/node/Node.py | 6 + src/decentralizepy/sharing/FFT.py | 333 +++++++++++++++ src/decentralizepy/sharing/SubSampling.py | 287 +++++++++++++ src/decentralizepy/sharing/TopK.py | 227 +++++++++++ src/decentralizepy/sharing/TopKParams.py | 225 +++++++++++ src/decentralizepy/sharing/Wavelet.py | 370 +++++++++++++++++ .../training/FrequencyAccumulator.py | 105 +++++ .../training/FrequencyWaveletAccumulator.py | 113 ++++++ .../training/ModelChangeAccumulator.py | 103 +++++ src/decentralizepy/training/Training.py | 2 +- 22 files changed, 2384 insertions(+), 17 deletions(-) create mode 100644 eval/96_regular.edges create mode 100644 eval/step_configs/config_femnist_fft.ini create mode 100644 eval/step_configs/config_femnist_sharing.ini create mode 100644 eval/step_configs/config_femnist_subsampling.ini create mode 100644 eval/step_configs/config_femnist_topk.ini create mode 100644 eval/step_configs/config_femnist_topkparam.ini create mode 100644 eval/step_configs/config_femnist_wavelet.ini create mode 100644 src/decentralizepy/sharing/FFT.py create mode 100644 src/decentralizepy/sharing/SubSampling.py create mode 100644 src/decentralizepy/sharing/TopK.py create mode 100644 src/decentralizepy/sharing/TopKParams.py create mode 100644 src/decentralizepy/sharing/Wavelet.py create mode 100644 src/decentralizepy/training/FrequencyAccumulator.py create mode 100644 src/decentralizepy/training/FrequencyWaveletAccumulator.py create mode 100644 src/decentralizepy/training/ModelChangeAccumulator.py diff --git a/eval/96_regular.edges b/eval/96_regular.edges new file mode 100644 index 0000000..0db09a2 --- /dev/null +++ b/eval/96_regular.edges @@ -0,0 +1,381 @@ +96 +0 24 +0 1 +0 26 +0 95 +1 2 +1 0 +1 82 +1 83 +2 33 +2 90 +2 3 +2 1 +3 2 +3 4 +3 14 +3 79 +4 3 +4 12 +4 5 +4 86 +5 64 +5 42 +5 4 +5 6 +6 9 +6 5 +6 62 +6 7 +7 24 +7 8 +7 45 +7 6 +8 81 +8 17 +8 9 +8 7 +9 8 +9 10 +9 53 +9 6 +10 9 +10 11 +10 29 +10 31 +11 80 +11 10 +11 36 +11 12 +12 11 +12 4 +12 13 +12 70 +13 12 +13 53 +13 30 +13 14 +14 3 +14 15 +14 13 +14 47 +15 16 +15 26 +15 14 +16 41 +16 17 +16 15 +17 8 +17 16 +17 18 +17 83 +18 17 +18 19 +18 95 +18 63 +19 82 +19 18 +19 20 +19 22 +20 19 +20 59 +20 21 +20 22 +21 72 +21 58 +21 20 +21 22 +22 19 +22 20 +22 21 +22 23 +23 24 +23 65 +23 85 +23 22 +24 0 +24 25 +24 23 +24 7 +25 32 +25 24 +25 26 +25 38 +26 0 +26 25 +26 27 +26 15 +27 32 +27 26 +27 28 +27 63 +28 27 +28 92 +28 29 +28 39 +29 10 +29 52 +29 28 +29 30 +30 66 +30 29 +30 13 +30 31 +31 32 +31 10 +31 36 +31 30 +32 25 +32 27 +32 31 +32 33 +33 32 +33 2 +33 84 +33 34 +34 33 +34 50 +34 35 +34 93 +35 57 +35 34 +35 43 +35 36 +36 35 +36 11 +36 37 +36 31 +37 88 +37 36 +37 38 +37 79 +38 25 +38 37 +38 39 +38 49 +39 40 +39 28 +39 77 +39 38 +40 41 +40 91 +40 39 +40 87 +41 16 +41 40 +41 42 +41 51 +42 41 +42 43 +42 5 +43 42 +43 35 +43 44 +44 72 +44 43 +44 75 +44 45 +45 67 +45 44 +45 46 +45 7 +46 76 +46 45 +46 54 
+46 47 +47 48 +47 65 +47 14 +47 46 +48 56 +48 49 +48 61 +48 47 +49 48 +49 50 +49 38 +49 71 +50 49 +50 34 +50 51 +50 93 +51 41 +51 50 +51 52 +51 95 +52 51 +52 74 +52 53 +52 29 +53 9 +53 52 +53 13 +53 54 +54 75 +54 53 +54 46 +54 55 +55 56 +55 69 +55 85 +55 54 +56 48 +56 57 +56 69 +56 55 +57 56 +57 89 +57 58 +57 35 +58 57 +58 59 +58 21 +58 86 +59 73 +59 58 +59 20 +59 60 +60 62 +60 59 +60 61 +60 78 +61 48 +61 62 +61 60 +61 94 +62 60 +62 61 +62 6 +62 63 +63 64 +63 18 +63 27 +63 62 +64 65 +64 84 +64 5 +64 63 +65 64 +65 66 +65 23 +65 47 +66 65 +66 89 +66 67 +66 30 +67 80 +67 66 +67 68 +67 45 +68 67 +68 92 +68 69 +68 94 +69 56 +69 68 +69 70 +69 55 +70 90 +70 12 +70 69 +70 71 +71 72 +71 49 +71 70 +71 87 +72 73 +72 44 +72 21 +72 71 +73 72 +73 91 +73 59 +73 74 +74 73 +74 75 +74 52 +74 76 +75 74 +75 44 +75 54 +75 76 +76 74 +76 75 +76 77 +76 46 +77 81 +77 76 +77 78 +77 39 +78 88 +78 60 +78 77 +78 79 +79 80 +79 3 +79 37 +79 78 +80 81 +80 67 +80 11 +80 79 +81 8 +81 82 +81 80 +81 77 +82 81 +82 1 +82 83 +82 19 +83 1 +83 82 +83 84 +83 17 +84 64 +84 33 +84 83 +84 85 +85 84 +85 55 +85 86 +85 23 +86 58 +86 4 +86 85 +86 87 +87 40 +87 88 +87 86 +87 71 +88 89 +88 37 +88 78 +88 87 +89 88 +89 57 +89 66 +89 90 +90 89 +90 2 +90 91 +90 70 +91 40 +91 73 +91 90 +91 92 +92 93 +92 91 +92 68 +92 28 +93 50 +93 34 +93 94 +93 92 +94 93 +94 68 +94 61 +94 95 +95 0 +95 18 +95 51 +95 94 diff --git a/eval/plot.py b/eval/plot.py index d3c3a39..f354937 100644 --- a/eval/plot.py +++ b/eval/plot.py @@ -61,14 +61,20 @@ def plot_results(path): plt.figure(1) means, stdevs, mins, maxs = get_stats([x["train_loss"] for x in results]) plot(means, stdevs, mins, maxs, "Training Loss", folder, "upper right") + with open(os.path.join(path, "train_loss_" + folder + ".json"), "w") as f: + json.dump({"mean": means, "std": stdevs}, f) # Plot Testing loss plt.figure(2) means, stdevs, mins, maxs = get_stats([x["test_loss"] for x in results]) plot(means, stdevs, mins, maxs, "Testing Loss", folder, "upper right") + with open(os.path.join(path, "test_loss_" + folder + ".json"), "w") as f: + json.dump({"mean": means, "std": stdevs}, f) # Plot Testing Accuracy plt.figure(3) means, stdevs, mins, maxs = get_stats([x["test_acc"] for x in results]) plot(means, stdevs, mins, maxs, "Testing Accuracy", folder, "lower right") + with open(os.path.join(path, "test_acc_" + folder + ".json"), "w") as f: + json.dump({"mean": means, "std": stdevs}, f) plt.figure(6) means, stdevs, mins, maxs = get_stats([x["grad_std"] for x in results]) plot( diff --git a/eval/run.sh b/eval/run.sh index 9869a17..0198413 100755 --- a/eval/run.sh +++ b/eval/run.sh @@ -4,29 +4,21 @@ decpy_path=~/Gitlab/decentralizepy/eval cd $decpy_path env_python=~/miniconda3/envs/decpy/bin/python3 -graph=96_nodes_random1.edges +graph=96_regular.edges original_config=epoch_configs/config_celeba.ini config_file=/tmp/config.ini procs_per_machine=16 machines=6 -iterations=76 -test_after=2 +iterations=200 +test_after=10 eval_file=testing.py log_level=INFO +log_dir_base=/mnt/nfs/some_user/logs/test m=`cat $(grep addresses_filepath $original_config | awk '{print $3}') | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` -cp $original_config $config_file -echo "alpha = 0.75" >> $config_file -$env_python $eval_file -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level - -cp $original_config $config_file -echo "alpha = 0.50" >> $config_file -$env_python $eval_file -mid $m -ps $procs_per_machine -ms $machines -is 
$iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level +log_dir=$log_dir_base$m cp $original_config $config_file -echo "alpha = 0.10" >> $config_file -$env_python $eval_file -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level - -config_file=epoch_configs/config_celeba_100.ini -$env_python $eval_file -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $original_config -ll $log_level +# echo "alpha = 0.10" >> $config_file +$env_python $eval_file -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level \ No newline at end of file diff --git a/eval/step_configs/config_femnist_fft.ini b/eval/step_configs/config_femnist_fft.ini new file mode 100644 index 0000000..32c5e17 --- /dev/null +++ b/eval/step_configs/config_femnist_fft.ini @@ -0,0 +1,37 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Femnist +dataset_class = Femnist +model_class = CNN +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +# There are 734463 femnist samples +[TRAIN_PARAMS] +training_package = decentralizepy.training.FrequencyAccumulator +training_class = FrequencyAccumulator +rounds = 47 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss +accumulation = True + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.FFT +sharing_class = FFT +alpha = 0.1 +change_based_selection = True +accumulation = True \ No newline at end of file diff --git a/eval/step_configs/config_femnist_sharing.ini b/eval/step_configs/config_femnist_sharing.ini new file mode 100644 index 0000000..42ab50c --- /dev/null +++ b/eval/step_configs/config_femnist_sharing.ini @@ -0,0 +1,32 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Femnist +dataset_class = Femnist +model_class = CNN +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 10 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.Sharing +sharing_class = Sharing diff --git a/eval/step_configs/config_femnist_subsampling.ini b/eval/step_configs/config_femnist_subsampling.ini new file mode 100644 index 0000000..53121d8 --- /dev/null +++ b/eval/step_configs/config_femnist_subsampling.ini @@ -0,0 +1,34 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Femnist +dataset_class = Femnist +model_class = CNN +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 
0.001 + +# There are 734463 femnist samples +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 47 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.SubSampling +sharing_class = SubSampling +alpha = 0.1 diff --git a/eval/step_configs/config_femnist_topk.ini b/eval/step_configs/config_femnist_topk.ini new file mode 100644 index 0000000..57ba8f0 --- /dev/null +++ b/eval/step_configs/config_femnist_topk.ini @@ -0,0 +1,36 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Femnist +dataset_class = Femnist +model_class = CNN +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +# There are 734463 femnist samples +[TRAIN_PARAMS] +training_package = decentralizepy.training.ModelChangeAccumulator +training_class = ModelChangeAccumulator +rounds = 47 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss +accumulation = True + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.TopK +sharing_class = TopK +alpha = 0.1 +accumulation = True \ No newline at end of file diff --git a/eval/step_configs/config_femnist_topkparam.ini b/eval/step_configs/config_femnist_topkparam.ini new file mode 100644 index 0000000..41c50c0 --- /dev/null +++ b/eval/step_configs/config_femnist_topkparam.ini @@ -0,0 +1,34 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Femnist +dataset_class = Femnist +model_class = CNN +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +# There are 734463 femnist samples +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 47 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.TopKParams +sharing_class = TopKParams +alpha = 0.1 diff --git a/eval/step_configs/config_femnist_wavelet.ini b/eval/step_configs/config_femnist_wavelet.ini new file mode 100644 index 0000000..e53e3ea --- /dev/null +++ b/eval/step_configs/config_femnist_wavelet.ini @@ -0,0 +1,41 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Femnist +dataset_class = Femnist +model_class = CNN +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +# There are 734463 femnist samples +[TRAIN_PARAMS] +training_package = decentralizepy.training.FrequencyWaveletAccumulator +training_class = FrequencyWaveletAccumulator +rounds = 47 
+full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss +wavelet=sym2 +level= None +accumulation = True + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.Wavelet +sharing_class = Wavelet +change_based_selection = True +alpha = 0.1 +wavelet=sym2 +level= None +accumulation = True diff --git a/eval/testing.py b/eval/testing.py index abd6333..0ae70de 100644 --- a/eval/testing.py +++ b/eval/testing.py @@ -24,7 +24,8 @@ def read_ini(file_path): if __name__ == "__main__": args = utils.get_args() - Path(args.log_dir).mkdir(parents=True, exist_ok=True) + # prevents accidental log overwrites + Path(args.log_dir).mkdir(parents=True, exist_ok=False) log_level = { "INFO": logging.INFO, diff --git a/setup.cfg b/setup.cfg index 3faa1f3..2ffd572 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ install_requires = pillow smallworld localconfig + PyWavelets include_package_data = True python_requires = >=3.6 [options.packages.find] diff --git a/src/decentralizepy/models/Model.py b/src/decentralizepy/models/Model.py index f757500..e9e556b 100644 --- a/src/decentralizepy/models/Model.py +++ b/src/decentralizepy/models/Model.py @@ -17,6 +17,9 @@ class Model(nn.Module): self.accumulated_gradients = [] self._param_count_ot = None self._param_count_total = None + self.accumulated_frequency = None + self.prev_model_params = None + self.prev = None def count_params(self, only_trainable=False): """ diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py index e5764ae..fd2c75f 100644 --- a/src/decentralizepy/node/Node.py +++ b/src/decentralizepy/node/Node.py @@ -92,6 +92,8 @@ class Node: The object containing the mapping rank <--> uid graph : decentralizepy.graphs The object containing the global graph + iterations : int + Number of iterations (communication steps) ) for which the model should be trained log_dir : str Logging directory reset_optimizer : int @@ -278,6 +280,8 @@ class Node: The object containing the global graph config : dict A dictionary of configurations. 
+ iterations : int + Number of iterations (communication steps) ) for which the model should be trained log_dir : str Logging directory log_level : logging.Level @@ -443,6 +447,8 @@ class Node: training_class = Training epochs_per_round = 25 batch_size = 64 + iterations : int + Number of iterations (communication steps) ) for which the model should be trained log_dir : str Logging directory log_level : logging.Level diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py new file mode 100644 index 0000000..4a3ee36 --- /dev/null +++ b/src/decentralizepy/sharing/FFT.py @@ -0,0 +1,333 @@ +import base64 +import json +import logging +import os +import pickle +from pathlib import Path +from time import time + +import torch +import torch.fft as fft + +from decentralizepy.sharing.Sharing import Sharing + + +class FFT(Sharing): + """ + This class implements the fft version of model sharing + It is based on PartialModel.py + + """ + + def __init__( + self, + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha=1.0, + dict_ordered=True, + save_shared=False, + metadata_cap=1.0, + pickle=True, + change_based_selection=True, + accumulation=True, + ): + """ + Constructor + + Parameters + ---------- + rank : int + Local rank + machine_id : int + Global machine id + communication : decentralizepy.communication.Communication + Communication module used to send and receive messages + mapping : decentralizepy.mappings.Mapping + Mapping (rank, machine_id) -> uid + graph : decentralizepy.graphs.Graph + Graph reprensenting neighbors + model : decentralizepy.models.Model + Model to train + dataset : decentralizepy.datasets.Dataset + Dataset for sharing data. Not implemented yet! TODO + log_dir : str + Location to write shared_params (only writing for 2 procs per machine) + alpha : float + Percentage of model to share + dict_ordered : bool + Specifies if the python dict maintains the order of insertion + save_shared : bool + Specifies if the indices of shared parameters should be logged + metadata_cap : float + Share full model when self.alpha > metadata_cap + pickle : bool + use pickle to serialize the model parameters + change_based_selection : bool + use frequency change to select topk frequencies + accumulation : bool + True if the the indices to share should be selected based on accumulated frequency change + """ + super().__init__( + rank, machine_id, communication, mapping, graph, model, dataset, log_dir + ) + self.alpha = alpha + self.dict_ordered = dict_ordered + self.save_shared = save_shared + self.metadata_cap = metadata_cap + self.total_meta = 0 + + self.pickle = pickle + + logging.info("subsampling pickling=" + str(pickle)) + + if self.save_shared: + # Only save for 2 procs: Save space + if rank != 0 or rank != 1: + self.save_shared = False + + if self.save_shared: + self.folder_path = os.path.join( + self.log_dir, "shared_params/{}".format(self.rank) + ) + Path(self.folder_path).mkdir(parents=True, exist_ok=True) + + self.change_based_selection = change_based_selection + self.accumulation = accumulation + + def apply_fft(self): + """ + Does fft transformation of the model parameters and selects topK (alpha) of them in the frequency domain + based on the undergone change during the current training step + + Returns + ------- + tuple + (a,b). a: selected fft frequencies (complex numbers), b: Their indices. 
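+
+        A doctest-style sketch of the selection step (the input tensor and the
+        0.25 ratio below are illustrative, not the real model):
+
+        >>> import torch
+        >>> concated = torch.arange(8, dtype=torch.float32)
+        >>> flat_fft = torch.fft.rfft(concated)
+        >>> _, index = torch.topk(flat_fft.abs(), round(0.25 * len(flat_fft)), sorted=False)
+        >>> index.tolist()  # the DC bin dominates this toy signal
+        [0]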
+ + """ + + logging.info("Returning fft compressed model weights") + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + + if self.change_based_selection: + flat_fft = fft.rfft(concated) + if self.accumulation: + logging.info( + "fft topk extract frequencies based on accumulated model frequency change" + ) + diff = self.model.accumulated_frequency + (flat_fft - self.model.prev) + else: + diff = flat_fft - self.model.accumulated_frequency + _, index = torch.topk( + diff.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False + ) + else: + flat_fft = fft.rfft(concated) + _, index = torch.topk( + flat_fft.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False + ) + + if self.accumulation: + self.model.accumulated_frequency[index] = 0.0 + return flat_fft[index], index + + def serialized_model(self): + """ + Convert model to json dict. self.alpha specifies the fraction of model to send. + + Returns + ------- + dict + Model converted to json dict + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().serialized_model() + + with torch.no_grad(): + topk, indices = self.apply_fft() + + if self.save_shared: + shared_params = dict() + shared_params["order"] = list(self.model.state_dict().keys()) + shapes = dict() + for k, v in self.model.state_dict().items(): + shapes[k] = list(v.shape) + shared_params["shapes"] = shapes + + shared_params[self.communication_round] = indices.tolist() # is slow + + shared_params["alpha"] = self.alpha + + with open( + os.path.join( + self.folder_path, + "{}_shared_params.json".format(self.communication_round + 1), + ), + "w", + ) as of: + json.dump(shared_params, of) + + m = dict() + + if not self.dict_ordered: + raise NotImplementedError + + m["alpha"] = self.alpha + m["params"] = topk.numpy() + m["indices"] = indices.numpy() + + self.total_data += len(self.communication.encrypt(m["params"])) + self.total_meta += len(self.communication.encrypt(m["indices"])) + len( + self.communication.encrypt(m["alpha"]) + ) + + return m + + def deserialized_model(self, m): + """ + Convert received json dict to state_dict. + + Parameters + ---------- + m : dict + json dict received + + Returns + ------- + state_dict + state_dict of received + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().deserialized_model(m) + + with torch.no_grad(): + state_dict = self.model.state_dict() + + if not self.dict_ordered: + raise NotImplementedError + + shapes = [] + lens = [] + tensors_to_cat = [] + for _, v in state_dict.items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + + T = torch.cat(tensors_to_cat, dim=0) + + indices = m["indices"] + alpha = m["alpha"] + params = m["params"] + + params_tensor = torch.tensor(params) + indices_tensor = torch.tensor(indices) + ret = dict() + ret["indices"] = indices_tensor + ret["params"] = params_tensor + return ret + + def step(self): + """ + Perform a sharing step. Implements D-PSGD. 
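+
+        Neighbour contributions are combined with Metropolis-Hastings style
+        weights, and the remaining mass stays with the local model; a small
+        arithmetic sketch of the weighting rule used below (the neighbour
+        degrees are made up):
+
+        >>> degrees = [3, 5]
+        >>> weights = [1 / (max(len(degrees), d) + 1) for d in degrees]
+        >>> round(1 - sum(weights), 4)  # weight kept by the local model
+        0.5833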
+ + """ + t_start = time() + data = self.serialized_model() + t_post_serialize = time() + my_uid = self.mapping.get_uid(self.rank, self.machine_id) + all_neighbors = self.graph.neighbors(my_uid) + iter_neighbors = self.get_neighbors(all_neighbors) + data["degree"] = len(all_neighbors) + data["iteration"] = self.communication_round + for neighbor in iter_neighbors: + self.communication.send(neighbor, data) + t_post_send = time() + logging.info("Waiting for messages from neighbors") + while not self.received_from_all(): + sender, data = self.communication.receive() + logging.debug("Received model from {}".format(sender)) + degree = data["degree"] + iteration = data["iteration"] + del data["degree"] + del data["iteration"] + self.peer_deques[sender].append((degree, iteration, data)) + logging.info( + "Deserialized received model from {} of iteration {}".format( + sender, iteration + ) + ) + t_post_recv = time() + + logging.info("Starting model averaging after receiving from all neighbors") + total = None + weight_total = 0 + + # FFT of this model + shapes = [] + lens = [] + tensors_to_cat = [] + for _, v in self.model.state_dict().items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + concated = torch.cat(tensors_to_cat, dim=0) + flat_fft = fft.rfft(concated) + + for i, n in enumerate(self.peer_deques): + degree, iteration, data = self.peer_deques[n].popleft() + logging.debug( + "Averaging model from neighbor {} of iteration {}".format(n, iteration) + ) + data = self.deserialized_model(data) + params = data["params"] + indices = data["indices"] + # use local data to complement + topkf = flat_fft.clone().detach() + topkf[indices] = params + + weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings + weight_total += weight + if total is None: + total = weight * topkf + else: + total += weight * topkf + + # Metro-Hastings + total += (1 - weight_total) * flat_fft + reverse_total = fft.irfft(total) + + start_index = 0 + std_dict = {} + for i, key in enumerate(self.model.state_dict()): + end_index = start_index + lens[i] + std_dict[key] = reverse_total[start_index:end_index].reshape(shapes[i]) + start_index = end_index + + self.model.load_state_dict(std_dict) + + logging.info("Model averaging complete") + + self.communication_round += 1 + + t_end = time() + + logging.info( + "Sharing::step | Serialize: %f; Send: %f; Recv: %f; Averaging: %f; Total: %f", + t_post_serialize - t_start, + t_post_send - t_post_serialize, + t_post_recv - t_post_send, + t_end - t_post_recv, + t_end - t_start, + ) diff --git a/src/decentralizepy/sharing/SubSampling.py b/src/decentralizepy/sharing/SubSampling.py new file mode 100644 index 0000000..6fe3f93 --- /dev/null +++ b/src/decentralizepy/sharing/SubSampling.py @@ -0,0 +1,287 @@ +import base64 +import json +import logging +import os +import pickle +from pathlib import Path + +import torch + +from decentralizepy.sharing.Sharing import Sharing + + +class SubSampling(Sharing): + """ + This class implements the subsampling version of model sharing + It is based on PartialModel.py + + """ + + def __init__( + self, + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha=1.0, + dict_ordered=True, + save_shared=False, + metadata_cap=1.0, + pickle=True, + layerwise=False, + ): + """ + Constructor + + Parameters + ---------- + rank : int + Local rank + machine_id : int + Global machine id + communication : decentralizepy.communication.Communication + Communication module 
used to send and receive messages + mapping : decentralizepy.mappings.Mapping + Mapping (rank, machine_id) -> uid + graph : decentralizepy.graphs.Graph + Graph reprensenting neighbors + model : decentralizepy.models.Model + Model to train + dataset : decentralizepy.datasets.Dataset + Dataset for sharing data. Not implemented yet! TODO + log_dir : str + Location to write shared_params (only writing for 2 procs per machine) + alpha : float + Percentage of model to share + dict_ordered : bool + Specifies if the python dict maintains the order of insertion + save_shared : bool + Specifies if the indices of shared parameters should be logged + metadata_cap : float + Share full model when self.alpha > metadata_cap + pickle : bool + use pickle to serialize the model parameters + + """ + super().__init__( + rank, machine_id, communication, mapping, graph, model, dataset, log_dir + ) + self.alpha = alpha + self.dict_ordered = dict_ordered + self.save_shared = save_shared + self.metadata_cap = metadata_cap + self.total_meta = 0 + + # self.random_seed_generator = torch.Generator() + # # Will use the random device if supported by CPU, else uses the system time + # # In the latter case we could get duplicate seeds on some of the machines + # self.random_seed_generator.seed() + + self.random_generator = torch.Generator() + # Will use the random device if supported by CPU, else uses the system time + # In the latter case we could get duplicate seeds on some of the machines + self.random_generator.seed() + self.seed = self.random_generator.initial_seed() + + self.pickle = pickle + self.layerwise = layerwise + + logging.info("subsampling pickling=" + str(pickle)) + + if self.save_shared: + # Only save for 2 procs: Save space + if rank != 0 or rank != 1: + self.save_shared = False + + if self.save_shared: + self.folder_path = os.path.join( + self.log_dir, "shared_params/{}".format(self.rank) + ) + Path(self.folder_path).mkdir(parents=True, exist_ok=True) + + def apply_subsampling(self): + """ + Creates a random binary mask that is used to subsample the parameters that will be shared + + Returns + ------- + tuple + (a,b,c). 
a: the selected parameters as flat vector, b: the random seed used to crate the binary mask + c: the alpha + + """ + + logging.info("Returning subsampling gradients") + if not self.layerwise: + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + concated = torch.cat(tensors_to_cat, dim=0) + + curr_seed = self.seed + self.communication_round # is increased in step + self.random_generator.manual_seed(curr_seed) + # logging.debug("Subsampling seed for uid = " + str(self.uid) + " is: " + str(curr_seed)) + # Or we could use torch.bernoulli + binary_mask = ( + torch.rand( + size=(concated.size(dim=0),), generator=self.random_generator + ) + <= self.alpha + ) + subsample = concated[binary_mask] + # logging.debug("Subsampling vector is of size: " + str(subsample.size(dim = 0))) + return (subsample, curr_seed, self.alpha) + else: + values_list = [] + offsets = [0] + off = 0 + curr_seed = self.seed + self.communication_round # is increased in step + self.random_generator.manual_seed(curr_seed) + for _, v in self.model.state_dict().items(): + flat = v.flatten() + binary_mask = ( + torch.rand( + size=(flat.size(dim=0),), generator=self.random_generator + ) + <= self.alpha + ) + selected = flat[binary_mask] + values_list.append(selected) + off += selected.size(dim=0) + offsets.append(off) + subsample = torch.cat(values_list, dim=0) + return (subsample, curr_seed, self.alpha) + + def serialized_model(self): + """ + Convert model to json dict. self.alpha specifies the fraction of model to send. + + Returns + ------- + dict + Model converted to json dict + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().serialized_model() + + with torch.no_grad(): + subsample, seed, alpha = self.apply_subsampling() + + if self.save_shared: + shared_params = dict() + shared_params["order"] = list(self.model.state_dict().keys()) + shapes = dict() + for k, v in self.model.state_dict().items(): + shapes[k] = list(v.shape) + shared_params["shapes"] = shapes + + # TODO: should store the shared indices and not the value + # shared_params[self.communication_round] = subsample.tolist() # is slow + + shared_params["seed"] = seed + + shared_params["alpha"] = alpha + + with open( + os.path.join( + self.folder_path, + "{}_shared_params.json".format(self.communication_round + 1), + ), + "w", + ) as of: + json.dump(shared_params, of) + + m = dict() + + if not self.dict_ordered: + raise NotImplementedError + + m["seed"] = seed + m["alpha"] = alpha + m["params"] = subsample.numpy() + + # logging.info("Converted dictionary to json") + self.total_data += len(self.communication.encrypt(m["params"])) + self.total_meta += len(self.communication.encrypt(m["seed"])) + len( + self.communication.encrypt(m["alpha"]) + ) + + return m + + def deserialized_model(self, m): + """ + Convert received json dict to state_dict. 
+ + Parameters + ---------- + m : dict + json dict received + + Returns + ------- + state_dict + state_dict of received + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().deserialized_model(m) + + with torch.no_grad(): + state_dict = self.model.state_dict() + + if not self.dict_ordered: + raise NotImplementedError + + seed = m["seed"] + alpha = m["alpha"] + params = m["params"] + + random_generator = ( + torch.Generator() + ) # new generator, such that we do not overwrite the other one + random_generator.manual_seed(seed) + + shapes = [] + lens = [] + tensors_to_cat = [] + binary_submasks = [] + for _, v in state_dict.items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + if self.layerwise: + binary_mask = ( + torch.rand(size=(t.size(dim=0),), generator=random_generator) + <= alpha + ) + binary_submasks.append(binary_mask) + + T = torch.cat(tensors_to_cat, dim=0) + + params_tensor = torch.from_numpy(params) + + if not self.layerwise: + binary_mask = ( + torch.rand(size=(T.size(dim=0),), generator=random_generator) + <= alpha + ) + else: + binary_mask = torch.cat(binary_submasks, dim=0) + + logging.debug("Original tensor: {}".format(T[binary_mask])) + T[binary_mask] = params_tensor + logging.debug("Final tensor: {}".format(T[binary_mask])) + + start_index = 0 + for i, key in enumerate(state_dict): + end_index = start_index + lens[i] + state_dict[key] = T[start_index:end_index].reshape(shapes[i]) + start_index = end_index + + return state_dict diff --git a/src/decentralizepy/sharing/TopK.py b/src/decentralizepy/sharing/TopK.py new file mode 100644 index 0000000..47b4151 --- /dev/null +++ b/src/decentralizepy/sharing/TopK.py @@ -0,0 +1,227 @@ +import json +import logging +import os +from pathlib import Path + +import torch + +from decentralizepy.sharing.Sharing import Sharing + + +class TopK(Sharing): + """ + This class implements topk selection of model parameters based on the model change since the beginning of the + communication step: --> Use ModelChangeAccumulator + + """ + + def __init__( + self, + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha=1.0, + dict_ordered=True, + save_shared=False, + metadata_cap=1.0, + accumulation=False, + ): + """ + Constructor + + Parameters + ---------- + rank : int + Local rank + machine_id : int + Global machine id + communication : decentralizepy.communication.Communication + Communication module used to send and receive messages + mapping : decentralizepy.mappings.Mapping + Mapping (rank, machine_id) -> uid + graph : decentralizepy.graphs.Graph + Graph reprensenting neighbors + model : decentralizepy.models.Model + Model to train + dataset : decentralizepy.datasets.Dataset + Dataset for sharing data. Not implemented yet! 
TODO
+        log_dir : str
+            Location to write shared_params (only writing for 2 procs per machine)
+        alpha : float
+            Percentage of model to share
+        dict_ordered : bool
+            Specifies if the python dict maintains the order of insertion
+        save_shared : bool
+            Specifies if the indices of shared parameters should be logged
+        metadata_cap : float
+            Share full model when self.alpha > metadata_cap
+        accumulation : bool
+            True if the indices to share should be selected based on the accumulated model change
+
+        """
+        super().__init__(
+            rank, machine_id, communication, mapping, graph, model, dataset, log_dir
+        )
+        self.alpha = alpha
+        self.dict_ordered = dict_ordered
+        self.save_shared = save_shared
+        self.metadata_cap = metadata_cap
+        self.total_meta = 0
+        self.accumulation = accumulation
+
+        if self.save_shared:
+            # Only save for 2 procs: Save space
+            if rank != 0 and rank != 1:
+                self.save_shared = False
+
+        if self.save_shared:
+            self.folder_path = os.path.join(
+                self.log_dir, "shared_params/{}".format(self.rank)
+            )
+            Path(self.folder_path).mkdir(parents=True, exist_ok=True)
+
+    def extract_top_gradients(self):
+        """
+        Extract the indices and values of the topK gradients.
+        The gradients must have been accumulated.
+
+        Returns
+        -------
+        tuple
+            (a,b). a: The magnitudes of the topK gradients, b: Their indices.
+
+        """
+        tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()]
+        concated = torch.cat(tensors_to_cat, dim=0)
+        if self.accumulation:
+            logging.info(
+                "TopK extract gradients based on accumulated model parameter change"
+            )
+            diff = self.model.prev_model_params + (concated - self.model.prev)
+        else:
+            diff = concated - self.model.prev_model_params
+        G_topk = torch.abs(diff)
+
+        std, mean = torch.std_mean(G_topk, unbiased=False)
+        self.std = std.item()
+        self.mean = mean.item()
+        value, ind = torch.topk(
+            G_topk, round(self.alpha * G_topk.shape[0]), dim=0, sorted=False
+        )
+
+        # only needed when ModelChangeAccumulator.accumulation = True
+        # does not cause problems otherwise
+        if self.accumulation:
+            self.model.prev_model_params[ind] = 0.0  # torch.zeros((len(G_topk),))
+        return value, ind
+
+    def serialized_model(self):
+        """
+        Convert model to a dict. self.alpha specifies the fraction of model to send.
+ + Returns + ------- + dict + Model converted to a dict + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().serialized_model() + + with torch.no_grad(): + _, G_topk = self.extract_top_gradients() + + if self.save_shared: + shared_params = dict() + shared_params["order"] = list(self.model.state_dict().keys()) + shapes = dict() + for k, v in self.model.state_dict().items(): + shapes[k] = list(v.shape) + shared_params["shapes"] = shapes + + shared_params[self.communication_round] = G_topk.tolist() + + with open( + os.path.join( + self.folder_path, + "{}_shared_params.json".format(self.communication_round + 1), + ), + "w", + ) as of: + json.dump(shared_params, of) + + logging.info("Extracting topk params") + + tensors_to_cat = [v.data.flatten() for v in self.model.parameters()] + T = torch.cat(tensors_to_cat, dim=0) + T_topk = T[G_topk] + + logging.info("Generating dictionary to send") + + m = dict() + + if not self.dict_ordered: + raise NotImplementedError + + m["indices"] = G_topk.numpy() + m["params"] = T_topk.numpy() + + assert len(m["indices"]) == len(m["params"]) + logging.info("Elements sending: {}".format(len(m["indices"]))) + + logging.info("Generated dictionary to send") + + logging.info("Converted dictionary to pickle") + self.total_data += len(self.communication.encrypt(m["params"])) + self.total_meta += len(self.communication.encrypt(m["indices"])) + + return m + + def deserialized_model(self, m): + """ + Convert received dict to state_dict. + + Parameters + ---------- + m : dict + dict received + + Returns + ------- + state_dict + state_dict of received + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().deserialized_model(m) + + with torch.no_grad(): + state_dict = self.model.state_dict() + + if not self.dict_ordered: + raise NotImplementedError + + shapes = [] + lens = [] + tensors_to_cat = [] + for _, v in state_dict.items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + + T = torch.cat(tensors_to_cat, dim=0) + index_tensor = torch.tensor(m["indices"]) + logging.debug("Original tensor: {}".format(T[index_tensor])) + T[index_tensor] = torch.tensor(m["params"]) + logging.debug("Final tensor: {}".format(T[index_tensor])) + start_index = 0 + for i, key in enumerate(state_dict): + end_index = start_index + lens[i] + state_dict[key] = T[start_index:end_index].reshape(shapes[i]) + start_index = end_index + + return state_dict diff --git a/src/decentralizepy/sharing/TopKParams.py b/src/decentralizepy/sharing/TopKParams.py new file mode 100644 index 0000000..3beb10f --- /dev/null +++ b/src/decentralizepy/sharing/TopKParams.py @@ -0,0 +1,225 @@ +import json +import logging +import os +from pathlib import Path + +import torch + +from decentralizepy.sharing.Sharing import Sharing + + +class TopKParams(Sharing): + """ + This class implements the vanilla version of partial model sharing. 
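+
+    A doctest-style sketch of the per-layer top-k selection used below (the
+    tensor and the 0.5 ratio are illustrative):
+
+    >>> import torch
+    >>> flat = torch.tensor([0.1, -3.0, 0.5, 2.0])
+    >>> _, index = torch.topk(flat.abs(), round(0.5 * flat.size(dim=0)), dim=0, sorted=False)
+    >>> sorted(index.tolist())
+    [1, 3]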
+ + """ + + def __init__( + self, + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha=1.0, + dict_ordered=True, + save_shared=False, + metadata_cap=1.0, + ): + """ + Constructor + + Parameters + ---------- + rank : int + Local rank + machine_id : int + Global machine id + communication : decentralizepy.communication.Communication + Communication module used to send and receive messages + mapping : decentralizepy.mappings.Mapping + Mapping (rank, machine_id) -> uid + graph : decentralizepy.graphs.Graph + Graph reprensenting neighbors + model : decentralizepy.models.Model + Model to train + dataset : decentralizepy.datasets.Dataset + Dataset for sharing data. Not implemented yet! TODO + log_dir : str + Location to write shared_params (only writing for 2 procs per machine) + alpha : float + Percentage of model to share + dict_ordered : bool + Specifies if the python dict maintains the order of insertion + save_shared : bool + Specifies if the indices of shared parameters should be logged + metadata_cap : float + Share full model when self.alpha > metadata_cap + + """ + super().__init__( + rank, machine_id, communication, mapping, graph, model, dataset, log_dir + ) + self.alpha = alpha + self.dict_ordered = dict_ordered + self.save_shared = save_shared + self.metadata_cap = metadata_cap + self.total_meta = 0 + + if self.save_shared: + # Only save for 2 procs: Save space + if rank != 0 or rank != 1: + self.save_shared = False + + if self.save_shared: + self.folder_path = os.path.join( + self.log_dir, "shared_params/{}".format(self.rank) + ) + Path(self.folder_path).mkdir(parents=True, exist_ok=True) + + def extract_top_params(self): + """ + Extract the indices and values of the topK params layerwise. + The gradients must have been accumulated. + + Returns + ------- + tuple + (a,b,c). a: The topK params, b: Their indices, c: The offsets + + """ + + logging.info("Returning TopKParams gradients") + values_list = [] + index_list = [] + offsets = [0] + off = 0 + for _, v in self.model.state_dict().items(): + flat = v.flatten() + values, index = torch.topk( + flat.abs(), round(self.alpha * flat.size(dim=0)), dim=0, sorted=False + ) + values_list.append(flat[index]) + index_list.append(index) + off += values.size(dim=0) + offsets.append(off) + cat_values = torch.cat(values_list, dim=0) + cat_index = torch.cat(index_list, dim=0) + + # logging.debug("Subsampling vector is of size: " + str(subsample.size(dim = 0))) + return (cat_values, cat_index, offsets) + + def serialized_model(self): + """ + Convert model to json dict. self.alpha specifies the fraction of model to send. 
+ + Returns + ------- + dict + Model converted to json dict + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().serialized_model() + + with torch.no_grad(): + values, index, offsets = self.extract_top_params() + + if self.save_shared: + shared_params = dict() + shared_params["order"] = list(self.model.state_dict().keys()) + shapes = dict() + for k, v in self.model.state_dict().items(): + shapes[k] = list(v.shape) + shared_params["shapes"] = shapes + + shared_params[self.communication_round] = index.tolist() + # TODO: store offsets + + with open( + os.path.join( + self.folder_path, + "{}_shared_params.json".format(self.communication_round + 1), + ), + "w", + ) as of: + json.dump(shared_params, of) + + logging.info("Extracting topk params") + + logging.info("Generating dictionary to send") + + m = dict() + + if not self.dict_ordered: + raise NotImplementedError + + m["indices"] = index.numpy() + m["params"] = values.numpy() + m["offsets"] = offsets + + assert len(m["indices"]) == len(m["params"]) + logging.info("Elements sending: {}".format(len(m["indices"]))) + + logging.info("Generated dictionary to send") + + # for key in m: + # m[key] = json.dumps(m[key]) + + logging.info("Converted dictionary to json") + self.total_data += len(self.communication.encrypt(m["params"])) + self.total_meta += len(self.communication.encrypt(m["indices"])) + len( + self.communication.encrypt(m["offsets"]) + ) + + return m + + def deserialized_model(self, m): + """ + Convert received json dict to state_dict. + + Parameters + ---------- + m : dict + json dict received + + Returns + ------- + state_dict + state_dict of received + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().deserialized_model(m) + + with torch.no_grad(): + state_dict = self.model.state_dict() + + if not self.dict_ordered: + raise NotImplementedError + + shapes = [] + lens = [] + tensors_to_cat = [] + offsets = m["offsets"] + params = torch.tensor(m["params"]) + indices = torch.tensor(m["indices"]) + + for i, (_, v) in enumerate(state_dict.items()): + shapes.append(v.shape) + t = v.flatten().clone().detach() # it is not always copied + lens.append(t.shape[0]) + index = indices[offsets[i] : offsets[i + 1]] + t[index] = params[offsets[i] : offsets[i + 1]] + tensors_to_cat.append(t) + + start_index = 0 + for i, key in enumerate(state_dict): + end_index = start_index + lens[i] + state_dict[key] = tensors_to_cat[i].reshape(shapes[i]) + start_index = end_index + + return state_dict diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py new file mode 100644 index 0000000..a6cccaf --- /dev/null +++ b/src/decentralizepy/sharing/Wavelet.py @@ -0,0 +1,370 @@ +import base64 +import json +import logging +import os +import pickle +from pathlib import Path +from time import time + +import pywt +import torch + +from decentralizepy.sharing.Sharing import Sharing + + +class Wavelet(Sharing): + """ + This class implements the wavelet version of model sharing + It is based on PartialModel.py + + """ + + def __init__( + self, + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha=1.0, + dict_ordered=True, + save_shared=False, + metadata_cap=1.0, + pickle=True, + wavelet="haar", + level=4, + change_based_selection=True, + accumulation=False, + ): + """ + Constructor + + Parameters + ---------- + rank : int + Local rank + machine_id : int + Global machine id + communication : decentralizepy.communication.Communication + Communication module 
used to send and receive messages + mapping : decentralizepy.mappings.Mapping + Mapping (rank, machine_id) -> uid + graph : decentralizepy.graphs.Graph + Graph reprensenting neighbors + model : decentralizepy.models.Model + Model to train + dataset : decentralizepy.datasets.Dataset + Dataset for sharing data. Not implemented yet! TODO + log_dir : str + Location to write shared_params (only writing for 2 procs per machine) + alpha : float + Percentage of model to share + dict_ordered : bool + Specifies if the python dict maintains the order of insertion + save_shared : bool + Specifies if the indices of shared parameters should be logged + metadata_cap : float + Share full model when self.alpha > metadata_cap + pickle : bool + use pickle to serialize the model parameters + wavelet: str + name of the wavelet to be used in gradient compression + level: int + name of the wavelet to be used in gradient compression + change_based_selection : bool + use frequency change to select topk frequencies + accumulation : bool + True if the the indices to share should be selected based on accumulated frequency change + """ + super().__init__( + rank, machine_id, communication, mapping, graph, model, dataset, log_dir + ) + self.alpha = alpha + self.dict_ordered = dict_ordered + self.save_shared = save_shared + self.metadata_cap = metadata_cap + self.total_meta = 0 + + self.pickle = pickle + self.wavelet = wavelet + self.level = level + self.accumulation = accumulation + + logging.info("subsampling pickling=" + str(pickle)) + + if self.save_shared: + # Only save for 2 procs: Save space + if rank != 0 or rank != 1: + self.save_shared = False + + if self.save_shared: + self.folder_path = os.path.join( + self.log_dir, "shared_params/{}".format(self.rank) + ) + Path(self.folder_path).mkdir(parents=True, exist_ok=True) + + self.change_based_selection = change_based_selection + + def apply_wavelet(self): + """ + Does wavelet transformation of the model parameters and selects topK (alpha) of them in the frequency domain + based on the undergone change during the current training step + + Returns + ------- + tuple + (a,b). a: selected wavelet coefficients, b: Their indices. + + """ + + logging.info("Returning dwt compressed model weights") + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + if self.change_based_selection: + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array( + coeff + ) # coeff_slices will be reproduced on the receiver + data = data.ravel() + + if self.accumulation: + logging.info( + "wavelet topk extract frequencies based on accumulated model frequency change" + ) + diff = self.model.accumulated_frequency + (data - self.model.prev) + else: + diff = data - self.model.accumulated_frequency + _, index = torch.topk( + torch.from_numpy(diff).abs(), + round(self.alpha * len(data)), + dim=0, + sorted=False, + ) + else: + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array( + coeff + ) # coeff_slices will be reproduced on the receiver + data = data.ravel() + _, index = torch.topk( + torch.from_numpy(data).abs(), + round(self.alpha * len(data)), + dim=0, + sorted=False, + ) + + if self.accumulation: + self.model.accumulated_frequency[index] = 0.0 + return torch.from_numpy(data[index]), index + + def serialized_model(self): + """ + Convert model to json dict. 
self.alpha specifies the fraction of model to send. + + Returns + ------- + dict + Model converted to json dict + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().serialized_model() + + with torch.no_grad(): + topk, indices = self.apply_wavelet() + + if self.save_shared: + shared_params = dict() + shared_params["order"] = list(self.model.state_dict().keys()) + shapes = dict() + for k, v in self.model.state_dict().items(): + shapes[k] = list(v.shape) + shared_params["shapes"] = shapes + + shared_params[self.communication_round] = indices.tolist() # is slow + + shared_params["alpha"] = self.alpha + + with open( + os.path.join( + self.folder_path, + "{}_shared_params.json".format(self.communication_round + 1), + ), + "w", + ) as of: + json.dump(shared_params, of) + + m = dict() + + if not self.dict_ordered: + raise NotImplementedError + + m["alpha"] = self.alpha + + m["params"] = topk.numpy() + + m["indices"] = indices.numpy() + + self.total_data += len(self.communication.encrypt(m["params"])) + self.total_meta += len(self.communication.encrypt(m["indices"])) + len( + self.communication.encrypt(m["alpha"]) + ) + + return m + + def deserialized_model(self, m): + """ + Convert received json dict to state_dict. + + Parameters + ---------- + m : dict + json dict received + + Returns + ------- + state_dict + state_dict of received + + """ + if self.alpha > self.metadata_cap: # Share fully + return super().deserialized_model(m) + + with torch.no_grad(): + state_dict = self.model.state_dict() + + if not self.dict_ordered: + raise NotImplementedError + + shapes = [] + lens = [] + tensors_to_cat = [] + for _, v in state_dict.items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + + T = torch.cat(tensors_to_cat, dim=0) + + indices = m["indices"] + alpha = m["alpha"] + params = m["params"] + + params_tensor = torch.tensor(params) + indices_tensor = torch.tensor(indices) + ret = dict() + ret["indices"] = indices_tensor + ret["params"] = params_tensor + return ret + + def step(self): + """ + Perform a sharing step. Implements D-PSGD. 
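+
+        Averaging happens on the flattened wavelet coefficients and is undone
+        afterwards; a round-trip sketch on a toy vector (wavelet name and
+        level are illustrative):
+
+        >>> import numpy as np
+        >>> import pywt
+        >>> x = np.arange(4, dtype=np.float64)
+        >>> arr, slices = pywt.coeffs_to_array(pywt.wavedec(x, "haar", level=1))
+        >>> coeffs = pywt.array_to_coeffs(arr, slices, output_format="wavedec")
+        >>> bool(np.allclose(pywt.waverec(coeffs, wavelet="haar"), x))
+        True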
+ + """ + t_start = time() + data = self.serialized_model() + t_post_serialize = time() + my_uid = self.mapping.get_uid(self.rank, self.machine_id) + all_neighbors = self.graph.neighbors(my_uid) + iter_neighbors = self.get_neighbors(all_neighbors) + data["degree"] = len(all_neighbors) + data["iteration"] = self.communication_round + for neighbor in iter_neighbors: + self.communication.send(neighbor, data) + t_post_send = time() + logging.info("Waiting for messages from neighbors") + while not self.received_from_all(): + sender, data = self.communication.receive() + logging.debug("Received model from {}".format(sender)) + degree = data["degree"] + iteration = data["iteration"] + del data["degree"] + del data["iteration"] + self.peer_deques[sender].append((degree, iteration, data)) + logging.info( + "Deserialized received model from {} of iteration {}".format( + sender, iteration + ) + ) + t_post_recv = time() + + logging.info("Starting model averaging after receiving from all neighbors") + total = None + weight_total = 0 + + # FFT of this model + shapes = [] + lens = [] + tensors_to_cat = [] + # TODO: should we detach + for _, v in self.model.state_dict().items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + concated = torch.cat(tensors_to_cat, dim=0) + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + wt_params, coeff_slices = pywt.coeffs_to_array( + coeff + ) # coeff_slices will be reproduced on the receiver + shape = wt_params.shape + wt_params = wt_params.ravel() + + for i, n in enumerate(self.peer_deques): + degree, iteration, data = self.peer_deques[n].popleft() + logging.debug( + "Averaging model from neighbor {} of iteration {}".format(n, iteration) + ) + data = self.deserialized_model(data) + params = data["params"] + indices = data["indices"] + # use local data to complement + topkwf = wt_params.copy() # .clone().detach() + topkwf[indices] = params + topkwf = torch.from_numpy(topkwf.reshape(shape)) + + weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings + weight_total += weight + if total is None: + total = weight * topkwf + else: + total += weight * topkwf + + # Metro-Hastings + total += (1 - weight_total) * wt_params + + avg_wf_params = pywt.array_to_coeffs( + total, coeff_slices, output_format="wavedec" + ) + reverse_total = torch.from_numpy( + pywt.waverec(avg_wf_params, wavelet=self.wavelet) + ) + + start_index = 0 + std_dict = {} + for i, key in enumerate(self.model.state_dict()): + end_index = start_index + lens[i] + std_dict[key] = reverse_total[start_index:end_index].reshape(shapes[i]) + start_index = end_index + + self.model.load_state_dict(std_dict) + + logging.info("Model averaging complete") + + self.communication_round += 1 + + t_end = time() + + logging.info( + "Sharing::step | Serialize: %f; Send: %f; Recv: %f; Averaging: %f; Total: %f", + t_post_serialize - t_start, + t_post_send - t_post_serialize, + t_post_recv - t_post_send, + t_end - t_post_recv, + t_end - t_start, + ) diff --git a/src/decentralizepy/training/FrequencyAccumulator.py b/src/decentralizepy/training/FrequencyAccumulator.py new file mode 100644 index 0000000..9c264cc --- /dev/null +++ b/src/decentralizepy/training/FrequencyAccumulator.py @@ -0,0 +1,105 @@ +import logging + +import torch +from torch import fft + +from decentralizepy.training.Training import Training + + +class FrequencyAccumulator(Training): + """ + This class implements the training module which also accumulates the fft frequency at the 
beginning of a communication round.
+
+    """
+
+    def __init__(
+        self,
+        rank,
+        machine_id,
+        mapping,
+        model,
+        optimizer,
+        loss,
+        log_dir,
+        rounds="",
+        full_epochs="",
+        batch_size="",
+        shuffle="",
+        accumulation=True,
+    ):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        rank : int
+            Rank of process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
+        model : torch.nn.Module
+            Neural Network for training
+        optimizer : torch.optim
+            Optimizer to learn parameters
+        loss : function
+            Loss function
+        log_dir : str
+            Directory to log the model change.
+        rounds : int, optional
+            Number of steps/epochs per training call
+        full_epochs : bool, optional
+            True if 1 round = 1 epoch. False if 1 round = 1 minibatch
+        batch_size : int, optional
+            Number of items to learn over, in one batch
+        shuffle : bool
+            True if the dataset should be shuffled before training.
+        accumulation : bool
+            True if the model change should be accumulated across communication steps
+        """
+        super().__init__(
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
+        )
+        self.accumulation = accumulation
+
+    def train(self, dataset):
+        """
+        Does one training iteration.
+        If self.accumulation is True then it accumulates model fft frequency changes in model.accumulated_frequency.
+        Otherwise it stores the current fft frequency representation of the model in model.accumulated_frequency.
+
+        Parameters
+        ----------
+        dataset : decentralizepy.datasets.Dataset
+            The training dataset. Should implement get_trainset(batch_size, shuffle)
+
+        """
+
+        # this looks at the change from the last round averaging of the frequencies
+        tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()]
+        concated = torch.cat(tensors_to_cat, dim=0)
+        flat_fft = fft.rfft(concated)
+        if self.accumulation:
+            if self.model.accumulated_frequency is None:
+                logging.info("Initialize fft frequency accumulation")
+                self.model.accumulated_frequency = torch.zeros_like(flat_fft)
+                self.model.prev = flat_fft
+            else:
+                logging.info("fft frequency accumulation step")
+                self.model.accumulated_frequency += flat_fft - self.model.prev
+                self.model.prev = flat_fft
+        else:
+            logging.info("fft frequency accumulation reset")
+            self.model.accumulated_frequency = flat_fft
+
+        super().train(dataset)
diff --git a/src/decentralizepy/training/FrequencyWaveletAccumulator.py b/src/decentralizepy/training/FrequencyWaveletAccumulator.py
new file mode 100644
index 0000000..cf65724
--- /dev/null
+++ b/src/decentralizepy/training/FrequencyWaveletAccumulator.py
@@ -0,0 +1,113 @@
+import logging
+
+import numpy as np
+import pywt
+import torch
+
+from decentralizepy.training.Training import Training
+
+
+class FrequencyWaveletAccumulator(Training):
+    """
+    This class implements the training module which also accumulates the wavelet frequency at the beginning of a communication round.
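+
+    A toy sketch of the accumulation rule (all values are made up):
+
+    >>> import numpy as np
+    >>> prev = np.array([1.0, 2.0])   # coefficients before this training call
+    >>> acc = np.zeros(2)             # change accumulated so far
+    >>> data = np.array([1.5, 1.0])   # coefficients after this training call
+    >>> acc += data - prev
+    >>> acc.tolist()
+    [0.5, -1.0]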
diff --git a/src/decentralizepy/training/FrequencyWaveletAccumulator.py b/src/decentralizepy/training/FrequencyWaveletAccumulator.py
new file mode 100644
index 0000000..cf65724
--- /dev/null
+++ b/src/decentralizepy/training/FrequencyWaveletAccumulator.py
@@ -0,0 +1,113 @@
+import logging
+
+import numpy as np
+import pywt
+import torch
+
+from decentralizepy.training.Training import Training
+
+
+class FrequencyWaveletAccumulator(Training):
+    """
+    This class implements the training module which also accumulates the wavelet frequency at the beginning of a communication round.
+
+    """
+
+    def __init__(
+        self,
+        rank,
+        machine_id,
+        mapping,
+        model,
+        optimizer,
+        loss,
+        log_dir,
+        rounds="",
+        full_epochs="",
+        batch_size="",
+        shuffle="",
+        wavelet="haar",
+        level=4,
+        accumulation=True,
+    ):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        rank : int
+            Rank of process local to the machine
+        machine_id : int
+            Machine ID on which the process is running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
+        model : torch.nn.Module
+            Neural Network for training
+        optimizer : torch.optim
+            Optimizer to learn parameters
+        loss : function
+            Loss function
+        log_dir : str
+            Directory to log the model change.
+        rounds : int, optional
+            Number of steps/epochs per training call
+        full_epochs : bool, optional
+            True if 1 round = 1 epoch. False if 1 round = 1 minibatch
+        batch_size : int, optional
+            Number of items to learn over, in one batch
+        shuffle : bool
+            True if the dataset should be shuffled before training.
+        accumulation : bool
+            True if the model change should be accumulated across communication steps
+        """
+        super().__init__(
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
+        )
+        self.wavelet = wavelet
+        self.level = level
+        self.accumulation = accumulation
+
+    def train(self, dataset):
+        """
+        Does one training iteration.
+        If self.accumulation is True then it accumulates model wavelet frequency changes in model.accumulated_frequency.
+        Otherwise it stores the current wavelet frequency representation of the model in model.accumulated_frequency.
+
+        Parameters
+        ----------
+        dataset : decentralizepy.datasets.Dataset
+            The training dataset. Should implement get_trainset(batch_size, shuffle)
+
+        """
+
+        # this looks at the change from the last round averaging of the frequencies
+        tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()]
+        concated = torch.cat(tensors_to_cat, dim=0)
+        coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level)
+        data, coeff_slices = pywt.coeffs_to_array(coeff)
+        data = data.ravel()
+        if self.accumulation:
+            if self.model.accumulated_frequency is None:
+                logging.info("Initialize wavelet frequency accumulation")
+                self.model.accumulated_frequency = np.zeros_like(
+                    data
+                )  # torch.zeros_like(data)
+                self.model.prev = data
+            else:
+                logging.info("wavelet frequency accumulation step")
+                self.model.accumulated_frequency += data - self.model.prev
+                self.model.prev = data
+        else:
+            logging.info("wavelet frequency accumulation reset")
+            self.model.accumulated_frequency = data
+        super().train(dataset)
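The wavelet variant leans on pywt's ability to flatten a multi-level decomposition into a single array and rebuild it from the recorded slices; the receiver side reassembles shared coefficients the same way. A minimal round trip under the same haar defaults (illustrative values, not part of the patch):

    import numpy as np
    import pywt

    x = np.random.randn(1000).astype(np.float32)

    coeff = pywt.wavedec(x, "haar", level=4)   # list of approx/detail arrays
    arr, slices = pywt.coeffs_to_array(coeff)  # one flat array + slice metadata
    flat = arr.ravel()

    rebuilt = pywt.array_to_coeffs(
        flat.reshape(arr.shape), slices, output_format="wavedec"
    )
    y = pywt.waverec(rebuilt, "haar")          # pywt pads internally for odd lengths
    print(np.abs(y[: len(x)] - x).max())       # ~0 up to float precision

Only the flat array has to be exchanged between nodes; the slice metadata is deterministic given the model size, wavelet, and level, so each peer can recompute it locally, as the "coeff_slices will be reproduced on the receiver" comment in Wavelet.py's step notes.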
diff --git a/src/decentralizepy/training/ModelChangeAccumulator.py b/src/decentralizepy/training/ModelChangeAccumulator.py
new file mode 100644
index 0000000..1c70283
--- /dev/null
+++ b/src/decentralizepy/training/ModelChangeAccumulator.py
@@ -0,0 +1,103 @@
+import logging
+
+import torch
+from torch import fft
+
+from decentralizepy.training.Training import Training
+
+
+class ModelChangeAccumulator(Training):
+    """
+    This class implements the training module which also accumulates the model change at the beginning of a communication round.
+
+    """
+
+    def __init__(
+        self,
+        rank,
+        machine_id,
+        mapping,
+        model,
+        optimizer,
+        loss,
+        log_dir,
+        rounds="",
+        full_epochs="",
+        batch_size="",
+        shuffle="",
+        accumulation=True,
+    ):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        rank : int
+            Rank of process local to the machine
+        machine_id : int
+            Machine ID on which the process in running
+        mapping : decentralizepy.mappings
+            The object containing the mapping rank <--> uid
+        model : torch.nn.Module
+            Neural Network for training
+        optimizer : torch.optim
+            Optimizer to learn parameters
+        loss : function
+            Loss function
+        log_dir : str
+            Directory to log the model change.
+        rounds : int, optional
+            Number of steps/epochs per training call
+        full_epochs: bool, optional
+            True if 1 round = 1 epoch. False if 1 round = 1 minibatch
+        batch_size : int, optional
+            Number of items to learn over, in one batch
+        shuffle : bool
+            True if the dataset should be shuffled before training.
+        accumulation : bool
+            True if the model change should be accumulated across communication steps
+
+        """
+        super().__init__(
+            rank,
+            machine_id,
+            mapping,
+            model,
+            optimizer,
+            loss,
+            log_dir,
+            rounds,
+            full_epochs,
+            batch_size,
+            shuffle,
+        )
+        self.accumulation = accumulation
+
+    def train(self, dataset):
+        """
+        Does one training iteration.
+        If self.accumulation is True then it accumulates model parameter changes in model.prev_model_params.
+        Otherwise it stores the current model parameters in model.prev_model_params.
+
+        Parameters
+        ----------
+        dataset : decentralizepy.datasets.Dataset
+            The training dataset. Should implement get_trainset(batch_size, shuffle)
+
+        """
+
+        tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()]
+        concated = torch.cat(tensors_to_cat, dim=0)
+        if self.accumulation:
+            if self.model.prev_model_params is None:
+                logging.info("Initialize model parameter accumulation.")
+                self.model.prev_model_params = torch.zeros_like(concated)
+                self.model.prev = concated
+            else:
+                logging.info("model parameter accumulation step")
+                self.model.prev_model_params += concated - self.model.prev
+                self.model.prev = concated
+        else:
+            logging.info("model parameter reset")
+            self.model.prev_model_params = concated
+        super().train(dataset)
diff --git a/src/decentralizepy/training/Training.py b/src/decentralizepy/training/Training.py
index 3b99bef..5adc4a9 100644
--- a/src/decentralizepy/training/Training.py
+++ b/src/decentralizepy/training/Training.py
@@ -46,7 +46,7 @@ class Training:
         Directory to log the model change.
     rounds : int, optional
         Number of steps/epochs per training call
-    full_epochs: bool, optional
+    full_epochs : bool, optional
         True if 1 round = 1 epoch.
False if 1 round = 1 minibatch batch_size : int, optional Number of items to learn over, in one batch -- GitLab From f77e0f94d0e10b004b000961cd11f6e268e14b49 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Wed, 9 Mar 2022 14:13:40 +0100 Subject: [PATCH 02/16] updated config files --- eval/run.sh | 2 +- eval/step_configs/config_femnist.ini | 1 + eval/step_configs/config_femnist_fft.ini | 1 + eval/step_configs/config_femnist_sharing.ini | 3 ++- eval/step_configs/config_femnist_subsampling.ini | 1 + eval/step_configs/config_femnist_topk.ini | 1 + eval/step_configs/config_femnist_topkparam.ini | 1 + eval/step_configs/config_femnist_wavelet.ini | 1 + 8 files changed, 9 insertions(+), 2 deletions(-) diff --git a/eval/run.sh b/eval/run.sh index 0198413..20a9d2b 100755 --- a/eval/run.sh +++ b/eval/run.sh @@ -21,4 +21,4 @@ log_dir=$log_dir_base$m cp $original_config $config_file # echo "alpha = 0.10" >> $config_file -$env_python $eval_file -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level \ No newline at end of file +$env_python $eval_file -ro 0 -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level \ No newline at end of file diff --git a/eval/step_configs/config_femnist.ini b/eval/step_configs/config_femnist.ini index 43bb07d..fa49fc9 100644 --- a/eval/step_configs/config_femnist.ini +++ b/eval/step_configs/config_femnist.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /home/risharma/leaf/data/femnist/per_user_data/train test_dir = /home/risharma/leaf/data/femnist/data/test diff --git a/eval/step_configs/config_femnist_fft.ini b/eval/step_configs/config_femnist_fft.ini index 32c5e17..b0eda41 100644 --- a/eval/step_configs/config_femnist_fft.ini +++ b/eval/step_configs/config_femnist_fft.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test diff --git a/eval/step_configs/config_femnist_sharing.ini b/eval/step_configs/config_femnist_sharing.ini index 42ab50c..c816302 100644 --- a/eval/step_configs/config_femnist_sharing.ini +++ b/eval/step_configs/config_femnist_sharing.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test @@ -15,7 +16,7 @@ lr = 0.001 [TRAIN_PARAMS] training_package = decentralizepy.training.Training training_class = Training -rounds = 10 +rounds = 47 full_epochs = False batch_size = 16 shuffle = True diff --git a/eval/step_configs/config_femnist_subsampling.ini b/eval/step_configs/config_femnist_subsampling.ini index 53121d8..61a1e9a 100644 --- a/eval/step_configs/config_femnist_subsampling.ini +++ b/eval/step_configs/config_femnist_subsampling.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test diff --git a/eval/step_configs/config_femnist_topk.ini b/eval/step_configs/config_femnist_topk.ini index 
57ba8f0..7c90588 100644 --- a/eval/step_configs/config_femnist_topk.ini +++ b/eval/step_configs/config_femnist_topk.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test diff --git a/eval/step_configs/config_femnist_topkparam.ini b/eval/step_configs/config_femnist_topkparam.ini index 41c50c0..ada3c3f 100644 --- a/eval/step_configs/config_femnist_topkparam.ini +++ b/eval/step_configs/config_femnist_topkparam.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test diff --git a/eval/step_configs/config_femnist_wavelet.ini b/eval/step_configs/config_femnist_wavelet.ini index e53e3ea..5228709 100644 --- a/eval/step_configs/config_femnist_wavelet.ini +++ b/eval/step_configs/config_femnist_wavelet.ini @@ -1,6 +1,7 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test -- GitLab From 0fa9ba103b62257693b6f0b8c8f61bc5f283f27f Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Wed, 9 Mar 2022 15:57:16 +0100 Subject: [PATCH 03/16] encoding indices as np.int32 --- src/decentralizepy/sharing/FFT.py | 18 +++--------------- src/decentralizepy/sharing/SubSampling.py | 5 ++--- src/decentralizepy/sharing/TopK.py | 5 +++-- src/decentralizepy/sharing/TopKParams.py | 5 +++-- src/decentralizepy/sharing/Wavelet.py | 7 +++---- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index 4a3ee36..1cc8382 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -1,13 +1,12 @@ -import base64 import json import logging import os -import pickle from pathlib import Path from time import time import torch import torch.fft as fft +import numpy as np from decentralizepy.sharing.Sharing import Sharing @@ -182,7 +181,7 @@ class FFT(Sharing): m["alpha"] = self.alpha m["params"] = topk.numpy() - m["indices"] = indices.numpy() + m["indices"] = indices.numpy().astype(np.int32) self.total_data += len(self.communication.encrypt(m["params"])) self.total_meta += len(self.communication.encrypt(m["indices"])) + len( @@ -215,23 +214,12 @@ class FFT(Sharing): if not self.dict_ordered: raise NotImplementedError - shapes = [] - lens = [] - tensors_to_cat = [] - for _, v in state_dict.items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - - T = torch.cat(tensors_to_cat, dim=0) - indices = m["indices"] alpha = m["alpha"] params = m["params"] params_tensor = torch.tensor(params) - indices_tensor = torch.tensor(indices) + indices_tensor = torch.tensor(indices, dtype=torch.long) ret = dict() ret["indices"] = indices_tensor ret["params"] = params_tensor diff --git a/src/decentralizepy/sharing/SubSampling.py b/src/decentralizepy/sharing/SubSampling.py index 6fe3f93..1e956cd 100644 --- a/src/decentralizepy/sharing/SubSampling.py +++ b/src/decentralizepy/sharing/SubSampling.py @@ -1,11 +1,10 @@ -import base64 import json import logging import os -import pickle from pathlib import Path import 
torch +import numpy as np from decentralizepy.sharing.Sharing import Sharing @@ -203,7 +202,7 @@ class SubSampling(Sharing): m["seed"] = seed m["alpha"] = alpha - m["params"] = subsample.numpy() + m["params"] = subsample.numpy().astype(np.int32) # logging.info("Converted dictionary to json") self.total_data += len(self.communication.encrypt(m["params"])) diff --git a/src/decentralizepy/sharing/TopK.py b/src/decentralizepy/sharing/TopK.py index 47b4151..f50ba7e 100644 --- a/src/decentralizepy/sharing/TopK.py +++ b/src/decentralizepy/sharing/TopK.py @@ -3,6 +3,7 @@ import logging import os from pathlib import Path +import numpy as np import torch from decentralizepy.sharing.Sharing import Sharing @@ -166,7 +167,7 @@ class TopK(Sharing): if not self.dict_ordered: raise NotImplementedError - m["indices"] = G_topk.numpy() + m["indices"] = G_topk.numpy().astype(np.int32) m["params"] = T_topk.numpy() assert len(m["indices"]) == len(m["params"]) @@ -214,7 +215,7 @@ class TopK(Sharing): tensors_to_cat.append(t) T = torch.cat(tensors_to_cat, dim=0) - index_tensor = torch.tensor(m["indices"]) + index_tensor = torch.tensor(m["indices"], dtype=torch.long) logging.debug("Original tensor: {}".format(T[index_tensor])) T[index_tensor] = torch.tensor(m["params"]) logging.debug("Final tensor: {}".format(T[index_tensor])) diff --git a/src/decentralizepy/sharing/TopKParams.py b/src/decentralizepy/sharing/TopKParams.py index 3beb10f..c6535ce 100644 --- a/src/decentralizepy/sharing/TopKParams.py +++ b/src/decentralizepy/sharing/TopKParams.py @@ -3,6 +3,7 @@ import logging import os from pathlib import Path +import numpy as np import torch from decentralizepy.sharing.Sharing import Sharing @@ -157,7 +158,7 @@ class TopKParams(Sharing): if not self.dict_ordered: raise NotImplementedError - m["indices"] = index.numpy() + m["indices"] = index.numpy().astype(np.int32) m["params"] = values.numpy() m["offsets"] = offsets @@ -206,7 +207,7 @@ class TopKParams(Sharing): tensors_to_cat = [] offsets = m["offsets"] params = torch.tensor(m["params"]) - indices = torch.tensor(m["indices"]) + indices = torch.tensor(m["indices"], dtype=torch.long) for i, (_, v) in enumerate(state_dict.items()): shapes.append(v.shape) diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index a6cccaf..774dfe0 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -1,11 +1,10 @@ -import base64 import json import logging import os -import pickle from pathlib import Path from time import time +import numpy as np import pywt import torch @@ -206,7 +205,7 @@ class Wavelet(Sharing): m["params"] = topk.numpy() - m["indices"] = indices.numpy() + m["indices"] = indices.numpy().astype(np.int32) self.total_data += len(self.communication.encrypt(m["params"])) self.total_meta += len(self.communication.encrypt(m["indices"])) + len( @@ -255,7 +254,7 @@ class Wavelet(Sharing): params = m["params"] params_tensor = torch.tensor(params) - indices_tensor = torch.tensor(indices) + indices_tensor = torch.tensor(indices, dtype=torch.long) ret = dict() ret["indices"] = indices_tensor ret["params"] = params_tensor -- GitLab From 5c138837ba671000d6110c265290088141b84774 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Wed, 9 Mar 2022 17:07:00 +0100 Subject: [PATCH 04/16] subsampling fix --- src/decentralizepy/sharing/FFT.py | 2 +- src/decentralizepy/sharing/SubSampling.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/decentralizepy/sharing/FFT.py 
b/src/decentralizepy/sharing/FFT.py index 1cc8382..1cdf701 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -4,9 +4,9 @@ import os from pathlib import Path from time import time +import numpy as np import torch import torch.fft as fft -import numpy as np from decentralizepy.sharing.Sharing import Sharing diff --git a/src/decentralizepy/sharing/SubSampling.py b/src/decentralizepy/sharing/SubSampling.py index 1e956cd..5ec0c44 100644 --- a/src/decentralizepy/sharing/SubSampling.py +++ b/src/decentralizepy/sharing/SubSampling.py @@ -4,7 +4,6 @@ import os from pathlib import Path import torch -import numpy as np from decentralizepy.sharing.Sharing import Sharing @@ -202,7 +201,7 @@ class SubSampling(Sharing): m["seed"] = seed m["alpha"] = alpha - m["params"] = subsample.numpy().astype(np.int32) + m["params"] = subsample.numpy() # logging.info("Converted dictionary to json") self.total_data += len(self.communication.encrypt(m["params"])) -- GitLab From 90772eb9df86dcf606fccdfd1dd054f88bfc0a06 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Mon, 14 Mar 2022 17:03:16 +0100 Subject: [PATCH 05/16] implemented suggested changes --- eval/plot.py | 13 +- eval/run.sh | 5 +- eval/step_configs/config_femnist_100.ini | 11 +- eval/step_configs/config_femnist_fft.ini | 3 +- eval/step_configs/config_femnist_sharing.ini | 33 --- ...st_topk.ini => config_femnist_topkacc.ini} | 7 +- eval/step_configs/config_femnist_wavelet.ini | 1 - eval/testing.py | 2 +- setup.cfg | 1 + src/decentralizepy/models/Model.py | 17 +- src/decentralizepy/node/Node.py | 6 +- src/decentralizepy/sharing/FFT.py | 18 +- src/decentralizepy/sharing/PartialModel.py | 1 + src/decentralizepy/sharing/TopK.py | 228 ------------------ src/decentralizepy/sharing/Wavelet.py | 33 +-- .../training/ChangeAccumulator.py | 25 ++ .../training/FrequencyAccumulator.py | 41 ++-- .../training/FrequencyWaveletAccumulator.py | 46 ++-- .../training/ModelChangeAccumulator.py | 103 -------- 19 files changed, 133 insertions(+), 461 deletions(-) delete mode 100644 eval/step_configs/config_femnist_sharing.ini rename eval/step_configs/{config_femnist_topk.ini => config_femnist_topkacc.ini} (84%) delete mode 100644 src/decentralizepy/sharing/TopK.py delete mode 100644 src/decentralizepy/training/ModelChangeAccumulator.py diff --git a/eval/plot.py b/eval/plot.py index f354937..0b7b66c 100644 --- a/eval/plot.py +++ b/eval/plot.py @@ -4,6 +4,7 @@ import sys import numpy as np from matplotlib import pyplot as plt +import pandas as pd def get_stats(l): @@ -61,20 +62,20 @@ def plot_results(path): plt.figure(1) means, stdevs, mins, maxs = get_stats([x["train_loss"] for x in results]) plot(means, stdevs, mins, maxs, "Training Loss", folder, "upper right") - with open(os.path.join(path, "train_loss_" + folder + ".json"), "w") as f: - json.dump({"mean": means, "std": stdevs}, f) + df = pd.DataFrame({"mean": list(means.values()), "std": list(stdevs.values()), "nr_nodes": [len(results)]*len(means)}, list(means.keys()), columns=["mean", "std", "nr_nodes"]) + df.to_csv(os.path.join(path, "train_loss_" + folder + ".csv")) # Plot Testing loss plt.figure(2) means, stdevs, mins, maxs = get_stats([x["test_loss"] for x in results]) plot(means, stdevs, mins, maxs, "Testing Loss", folder, "upper right") - with open(os.path.join(path, "test_loss_" + folder + ".json"), "w") as f: - json.dump({"mean": means, "std": stdevs}, f) + df = pd.DataFrame({"mean": list(means.values()), "std": list(stdevs.values()), "nr_nodes": [len(results)]*len(means)}, 
list(means.keys()), columns=["mean", "std", "nr_nodes"]) + df.to_csv(os.path.join(path, "test_loss_" + folder + ".csv")) # Plot Testing Accuracy plt.figure(3) means, stdevs, mins, maxs = get_stats([x["test_acc"] for x in results]) plot(means, stdevs, mins, maxs, "Testing Accuracy", folder, "lower right") - with open(os.path.join(path, "test_acc_" + folder + ".json"), "w") as f: - json.dump({"mean": means, "std": stdevs}, f) + df = pd.DataFrame({"mean": list(means.values()), "std": list(stdevs.values()), "nr_nodes": [len(results)]*len(means)}, list(means.keys()), columns=["mean", "std", "nr_nodes"]) + df.to_csv(os.path.join(path, "test_acc_" + folder + ".csv")) plt.figure(6) means, stdevs, mins, maxs = get_stats([x["grad_std"] for x in results]) plot( diff --git a/eval/run.sh b/eval/run.sh index 20a9d2b..528bdc9 100755 --- a/eval/run.sh +++ b/eval/run.sh @@ -13,11 +13,10 @@ iterations=200 test_after=10 eval_file=testing.py log_level=INFO -log_dir_base=/mnt/nfs/some_user/logs/test m=`cat $(grep addresses_filepath $original_config | awk '{print $3}') | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` - -log_dir=$log_dir_base$m +log_dir=$(date '+%Y-%m-%dT%H:%M')/machine$m +mkdir -p $log_dir cp $original_config $config_file # echo "alpha = 0.10" >> $config_file diff --git a/eval/step_configs/config_femnist_100.ini b/eval/step_configs/config_femnist_100.ini index 4e3e9ba..e1af10b 100644 --- a/eval/step_configs/config_femnist_100.ini +++ b/eval/step_configs/config_femnist_100.ini @@ -1,11 +1,12 @@ [DATASET] dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist +random_seed = 97 model_class = CNN -train_dir = /home/risharma/leaf/data/femnist/per_user_data/train -test_dir = /home/risharma/leaf/data/femnist/data/test +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test ; python list of fractions below -sizes = +sizes = [OPTIMIZER_PARAMS] optimizer_package = torch.optim @@ -15,9 +16,9 @@ lr = 0.001 [TRAIN_PARAMS] training_package = decentralizepy.training.Training training_class = Training -rounds = 20 +rounds = 47 full_epochs = False -batch_size = 64 +batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss diff --git a/eval/step_configs/config_femnist_fft.ini b/eval/step_configs/config_femnist_fft.ini index b0eda41..13a769c 100644 --- a/eval/step_configs/config_femnist_fft.ini +++ b/eval/step_configs/config_femnist_fft.ini @@ -34,5 +34,4 @@ addresses_filepath = ip_addr_6Machines.json sharing_package = decentralizepy.sharing.FFT sharing_class = FFT alpha = 0.1 -change_based_selection = True -accumulation = True \ No newline at end of file +change_based_selection = True \ No newline at end of file diff --git a/eval/step_configs/config_femnist_sharing.ini b/eval/step_configs/config_femnist_sharing.ini deleted file mode 100644 index c816302..0000000 --- a/eval/step_configs/config_femnist_sharing.ini +++ /dev/null @@ -1,33 +0,0 @@ -[DATASET] -dataset_package = decentralizepy.datasets.Femnist -dataset_class = Femnist -random_seed = 97 -model_class = CNN -train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train -test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test -; python list of fractions below -sizes = - -[OPTIMIZER_PARAMS] -optimizer_package = torch.optim -optimizer_class = Adam -lr = 0.001 - -[TRAIN_PARAMS] -training_package = decentralizepy.training.Training -training_class = Training -rounds = 47 -full_epochs = False -batch_size = 16 
-shuffle = True -loss_package = torch.nn -loss_class = CrossEntropyLoss - -[COMMUNICATION] -comm_package = decentralizepy.communication.TCP -comm_class = TCP -addresses_filepath = ip_addr_6Machines.json - -[SHARING] -sharing_package = decentralizepy.sharing.Sharing -sharing_class = Sharing diff --git a/eval/step_configs/config_femnist_topk.ini b/eval/step_configs/config_femnist_topkacc.ini similarity index 84% rename from eval/step_configs/config_femnist_topk.ini rename to eval/step_configs/config_femnist_topkacc.ini index 7c90588..e65f225 100644 --- a/eval/step_configs/config_femnist_topk.ini +++ b/eval/step_configs/config_femnist_topkacc.ini @@ -15,8 +15,8 @@ lr = 0.001 # There are 734463 femnist samples [TRAIN_PARAMS] -training_package = decentralizepy.training.ModelChangeAccumulator -training_class = ModelChangeAccumulator +training_package = decentralizepy.training.ChangeAccumulator +training_class = ChangeAccumulator rounds = 47 full_epochs = False batch_size = 16 @@ -33,5 +33,4 @@ addresses_filepath = ip_addr_6Machines.json [SHARING] sharing_package = decentralizepy.sharing.TopK sharing_class = TopK -alpha = 0.1 -accumulation = True \ No newline at end of file +alpha = 0.1 \ No newline at end of file diff --git a/eval/step_configs/config_femnist_wavelet.ini b/eval/step_configs/config_femnist_wavelet.ini index 5228709..ac3bac2 100644 --- a/eval/step_configs/config_femnist_wavelet.ini +++ b/eval/step_configs/config_femnist_wavelet.ini @@ -39,4 +39,3 @@ change_based_selection = True alpha = 0.1 wavelet=sym2 level= None -accumulation = True diff --git a/eval/testing.py b/eval/testing.py index 0ae70de..bb16c2f 100644 --- a/eval/testing.py +++ b/eval/testing.py @@ -25,7 +25,7 @@ if __name__ == "__main__": args = utils.get_args() # prevents accidental log overwrites - Path(args.log_dir).mkdir(parents=True, exist_ok=False) + Path(args.log_dir).mkdir(parents=True, exist_ok=True) log_level = { "INFO": logging.INFO, diff --git a/setup.cfg b/setup.cfg index 2ffd572..0b85f72 100644 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ install_requires = smallworld localconfig PyWavelets + pandas include_package_data = True python_requires = >=3.6 [options.packages.find] diff --git a/src/decentralizepy/models/Model.py b/src/decentralizepy/models/Model.py index e9e556b..643eec5 100644 --- a/src/decentralizepy/models/Model.py +++ b/src/decentralizepy/models/Model.py @@ -17,9 +17,7 @@ class Model(nn.Module): self.accumulated_gradients = [] self._param_count_ot = None self._param_count_total = None - self.accumulated_frequency = None - self.prev_model_params = None - self.prev = None + self.accumulated_changes = None def count_params(self, only_trainable=False): """ @@ -46,3 +44,16 @@ class Model(nn.Module): if not self._param_count_total: self._param_count_total = sum(p.numel() for p in self.parameters()) return self._param_count_total + + def rewind_accumulation(self, indices): + """ + resets accumulated_changes at the given indices + + Parameters + ---------- + indices : torch.Tensor + Tensor that contains indices corresponding to the flatten model + + """ + if self.accumulated_changes is not None: + self.accumulated_changes[indices] = 0.0 \ No newline at end of file diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py index fd2c75f..7854c38 100644 --- a/src/decentralizepy/node/Node.py +++ b/src/decentralizepy/node/Node.py @@ -93,7 +93,7 @@ class Node: graph : decentralizepy.graphs The object containing the global graph iterations : int - Number of iterations (communication 
steps) ) for which the model should be trained + Number of iterations (communication steps) for which the model should be trained log_dir : str Logging directory reset_optimizer : int @@ -281,7 +281,7 @@ class Node: config : dict A dictionary of configurations. iterations : int - Number of iterations (communication steps) ) for which the model should be trained + Number of iterations (communication steps) for which the model should be trained log_dir : str Logging directory log_level : logging.Level @@ -448,7 +448,7 @@ class Node: epochs_per_round = 25 batch_size = 64 iterations : int - Number of iterations (communication steps) ) for which the model should be trained + Number of iterations (communication steps) for which the model should be trained log_dir : str Logging directory log_level : logging.Level diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index 1cdf701..a4c3b59 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -114,27 +114,19 @@ class FFT(Sharing): logging.info("Returning fft compressed model weights") tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] concated = torch.cat(tensors_to_cat, dim=0) - + flat_fft = fft.rfft(concated) if self.change_based_selection: - flat_fft = fft.rfft(concated) - if self.accumulation: - logging.info( - "fft topk extract frequencies based on accumulated model frequency change" - ) - diff = self.model.accumulated_frequency + (flat_fft - self.model.prev) - else: - diff = flat_fft - self.model.accumulated_frequency + + assert len(self.model.accumulated_gradients) == 1 + diff = self.model.accumulated_gradients[0] _, index = torch.topk( diff.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False ) else: - flat_fft = fft.rfft(concated) _, index = torch.topk( flat_fft.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False ) - if self.accumulation: - self.model.accumulated_frequency[index] = 0.0 return flat_fft[index], index def serialized_model(self): @@ -153,6 +145,8 @@ class FFT(Sharing): with torch.no_grad(): topk, indices = self.apply_fft() + self.model.rewind_accumulation(indices) + if self.save_shared: shared_params = dict() shared_params["order"] = list(self.model.state_dict().keys()) diff --git a/src/decentralizepy/sharing/PartialModel.py b/src/decentralizepy/sharing/PartialModel.py index 6a8f0cb..204ee23 100644 --- a/src/decentralizepy/sharing/PartialModel.py +++ b/src/decentralizepy/sharing/PartialModel.py @@ -124,6 +124,7 @@ class PartialModel(Sharing): with torch.no_grad(): _, G_topk = self.extract_top_gradients() + self.model.rewind_accumulation(G_topk) if self.save_shared: shared_params = dict() shared_params["order"] = list(self.model.state_dict().keys()) diff --git a/src/decentralizepy/sharing/TopK.py b/src/decentralizepy/sharing/TopK.py deleted file mode 100644 index f50ba7e..0000000 --- a/src/decentralizepy/sharing/TopK.py +++ /dev/null @@ -1,228 +0,0 @@ -import json -import logging -import os -from pathlib import Path - -import numpy as np -import torch - -from decentralizepy.sharing.Sharing import Sharing - - -class TopK(Sharing): - """ - This class implements topk selection of model parameters based on the model change since the beginning of the - communication step: --> Use ModelChangeAccumulator - - """ - - def __init__( - self, - rank, - machine_id, - communication, - mapping, - graph, - model, - dataset, - log_dir, - alpha=1.0, - dict_ordered=True, - save_shared=False, - metadata_cap=1.0, - accumulation=False, - 
): - """ - Constructor - - Parameters - ---------- - rank : int - Local rank - machine_id : int - Global machine id - communication : decentralizepy.communication.Communication - Communication module used to send and receive messages - mapping : decentralizepy.mappings.Mapping - Mapping (rank, machine_id) -> uid - graph : decentralizepy.graphs.Graph - Graph reprensenting neighbors - model : decentralizepy.models.Model - Model to train - dataset : decentralizepy.datasets.Dataset - Dataset for sharing data. Not implemented yet! TODO - log_dir : str - Location to write shared_params (only writing for 2 procs per machine) - alpha : float - Percentage of model to share - dict_ordered : bool - Specifies if the python dict maintains the order of insertion - save_shared : bool - Specifies if the indices of shared parameters should be logged - metadata_cap : float - Share full model when self.alpha > metadata_cap - - """ - super().__init__( - rank, machine_id, communication, mapping, graph, model, dataset, log_dir - ) - self.alpha = alpha - self.dict_ordered = dict_ordered - self.save_shared = save_shared - self.metadata_cap = metadata_cap - self.total_meta = 0 - self.accumulation = accumulation - - if self.save_shared: - # Only save for 2 procs: Save space - if rank != 0 or rank != 1: - self.save_shared = False - - if self.save_shared: - self.folder_path = os.path.join( - self.log_dir, "shared_params/{}".format(self.rank) - ) - Path(self.folder_path).mkdir(parents=True, exist_ok=True) - - def extract_top_gradients(self): - """ - Extract the indices and values of the topK gradients. - The gradients must have been accumulationd. - - Returns - ------- - tuple - (a,b). a: The magnitudes of the topK gradients, b: Their indices. - - """ - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] - concated = torch.cat(tensors_to_cat, dim=0) - if self.accumulation: - logging.info( - "TopK extract gradients based on accumulated model parameter change" - ) - diff = self.model.prev_model_params + (concated - self.model.prev) - else: - diff = concated - self.model.prev_model_params - G_topk = torch.abs(diff) - - std, mean = torch.std_mean(G_topk, unbiased=False) - self.std = std.item() - self.mean = mean.item() - value, ind = torch.topk( - G_topk, round(self.alpha * G_topk.shape[0]), dim=0, sorted=False - ) - - # only needed when ModelChangeAccumulator.accumulation = True - # does not cause problems otherwise - if self.accumulation: - self.model.prev_model_params[ind] = 0.0 # torch.zeros((len(G_topk),)) - return value, ind - - def serialized_model(self): - """ - Convert model to a dict. self.alpha specifies the fraction of model to send. 
- - Returns - ------- - dict - Model converted to a dict - - """ - if self.alpha > self.metadata_cap: # Share fully - return super().serialized_model() - - with torch.no_grad(): - _, G_topk = self.extract_top_gradients() - - if self.save_shared: - shared_params = dict() - shared_params["order"] = list(self.model.state_dict().keys()) - shapes = dict() - for k, v in self.model.state_dict().items(): - shapes[k] = list(v.shape) - shared_params["shapes"] = shapes - - shared_params[self.communication_round] = G_topk.tolist() - - with open( - os.path.join( - self.folder_path, - "{}_shared_params.json".format(self.communication_round + 1), - ), - "w", - ) as of: - json.dump(shared_params, of) - - logging.info("Extracting topk params") - - tensors_to_cat = [v.data.flatten() for v in self.model.parameters()] - T = torch.cat(tensors_to_cat, dim=0) - T_topk = T[G_topk] - - logging.info("Generating dictionary to send") - - m = dict() - - if not self.dict_ordered: - raise NotImplementedError - - m["indices"] = G_topk.numpy().astype(np.int32) - m["params"] = T_topk.numpy() - - assert len(m["indices"]) == len(m["params"]) - logging.info("Elements sending: {}".format(len(m["indices"]))) - - logging.info("Generated dictionary to send") - - logging.info("Converted dictionary to pickle") - self.total_data += len(self.communication.encrypt(m["params"])) - self.total_meta += len(self.communication.encrypt(m["indices"])) - - return m - - def deserialized_model(self, m): - """ - Convert received dict to state_dict. - - Parameters - ---------- - m : dict - dict received - - Returns - ------- - state_dict - state_dict of received - - """ - if self.alpha > self.metadata_cap: # Share fully - return super().deserialized_model(m) - - with torch.no_grad(): - state_dict = self.model.state_dict() - - if not self.dict_ordered: - raise NotImplementedError - - shapes = [] - lens = [] - tensors_to_cat = [] - for _, v in state_dict.items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - - T = torch.cat(tensors_to_cat, dim=0) - index_tensor = torch.tensor(m["indices"], dtype=torch.long) - logging.debug("Original tensor: {}".format(T[index_tensor])) - T[index_tensor] = torch.tensor(m["params"]) - logging.debug("Final tensor: {}".format(T[index_tensor])) - start_index = 0 - for i, key in enumerate(state_dict): - end_index = start_index + lens[i] - state_dict[key] = T[start_index:end_index].reshape(shapes[i]) - start_index = end_index - - return state_dict diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index 774dfe0..2ec700a 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -122,32 +122,23 @@ class Wavelet(Sharing): logging.info("Returning dwt compressed model weights") tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] concated = torch.cat(tensors_to_cat, dim=0) + + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array( + coeff + ) # coeff_slices will be reproduced on the receiver + data = data.ravel() + if self.change_based_selection: - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array( - coeff - ) # coeff_slices will be reproduced on the receiver - data = data.ravel() - - if self.accumulation: - logging.info( - "wavelet topk extract frequencies based on accumulated model frequency change" - ) - diff = self.model.accumulated_frequency + 
(data - self.model.prev) - else: - diff = data - self.model.accumulated_frequency + assert len(self.model.accumulated_gradients) == 1 + diff = self.model.accumulated_gradients[0] _, index = torch.topk( - torch.from_numpy(diff).abs(), + diff.abs(), round(self.alpha * len(data)), dim=0, sorted=False, ) else: - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array( - coeff - ) # coeff_slices will be reproduced on the receiver - data = data.ravel() _, index = torch.topk( torch.from_numpy(data).abs(), round(self.alpha * len(data)), @@ -155,8 +146,6 @@ class Wavelet(Sharing): sorted=False, ) - if self.accumulation: - self.model.accumulated_frequency[index] = 0.0 return torch.from_numpy(data[index]), index def serialized_model(self): @@ -175,6 +164,8 @@ class Wavelet(Sharing): with torch.no_grad(): topk, indices = self.apply_wavelet() + self.model.rewind_accumulation(indices) + if self.save_shared: shared_params = dict() shared_params["order"] = list(self.model.state_dict().keys()) diff --git a/src/decentralizepy/training/ChangeAccumulator.py b/src/decentralizepy/training/ChangeAccumulator.py index 6ee5dc7..c3bc81a 100644 --- a/src/decentralizepy/training/ChangeAccumulator.py +++ b/src/decentralizepy/training/ChangeAccumulator.py @@ -28,6 +28,7 @@ class ChangeAccumulator(Training): batch_size="", shuffle="", save_accumulated="", + accumulation=True, ): """ Constructor @@ -58,6 +59,8 @@ class ChangeAccumulator(Training): True if the dataset should be shuffled before training. save_accumulated : bool True if accumulated weight change should be written to file + accumulation : bool + True if the model change should be accumulated across communication steps """ super().__init__( @@ -85,6 +88,9 @@ class ChangeAccumulator(Training): self.log_dir, "model_val/{}".format(self.rank) ) Path(self.model_val_path).mkdir(parents=True, exist_ok=True) + self.accumulation = accumulation + self.init_model = None + self.prev = None def save_vector(self, v, s): """ @@ -152,12 +158,31 @@ class ChangeAccumulator(Training): k: v.data.clone().detach() for k, v in zip(self.model.state_dict(), self.model.parameters()) } + if self.accumulation: + if self.model.accumulated_changes is None: + flats = [v.data.flatten() for _, v in self.init_model.items()] + flat = torch.cat(flats) + self.model.accumulated_changes = torch.zeros_like(flat) + self.prev = flat + else: + flats = [v.data.flatten() for _, v in self.init_model.items()] + flat = torch.cat(flats) + self.model.accumulated_changes += (flat - self.prev) + self.prev = flat + super().train(dataset) with torch.no_grad(): change = { k: v.data.clone().detach() - self.init_model[k] for k, v in zip(self.model.state_dict(), self.model.parameters()) } + if self.accumulation: + flats_change = [v.data.flatten() for _, v in change.items()] + flat_change = torch.cat(flats_change) + # flatten does not copy data if input is already flattened + # however cat copies + change = {"flat" : self.model.accumulated_changes + flat_change} + self.model.accumulated_gradients.append(change) if self.save_accumulated: diff --git a/src/decentralizepy/training/FrequencyAccumulator.py b/src/decentralizepy/training/FrequencyAccumulator.py index 9c264cc..7d7c9ab 100644 --- a/src/decentralizepy/training/FrequencyAccumulator.py +++ b/src/decentralizepy/training/FrequencyAccumulator.py @@ -71,6 +71,8 @@ class FrequencyAccumulator(Training): shuffle, ) self.accumulation = accumulation + self.init_model = None + self.prev = None def train(self, 
dataset): """ @@ -84,22 +86,27 @@ class FrequencyAccumulator(Training): The training dataset. Should implement get_trainset(batch_size, shuffle) """ - - # this looks at the change from the last round averaging of the frequencies - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] - concated = torch.cat(tensors_to_cat, dim=0) - flat_fft = fft.rfft(concated) - if self.accumulation: - if self.model.accumulated_frequency is None: - logging.info("Initialize fft frequency accumulation") - self.model.accumulated_frequency = torch.zeros_like(flat_fft) - self.model.prev = flat_fft - else: - logging.info("fft frequency accumulation step") - self.model.accumulated_frequency += flat_fft - self.model.prev - self.model.prev = flat_fft - else: - logging.info("fft frequency accumulation reset") - self.model.accumulated_frequency = flat_fft + with torch.no_grad(): + self.model.accumulated_gradients = [] + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + self.init_model = fft.rfft(concated) + if self.accumulation: + if self.model.accumulated_changes is None: + self.model.accumulated_changes = torch.zeros_like(self.init_model) + self.prev = self.init_model + else: + self.model.accumulated_changes += (self.init_model - self.prev) + self.prev = self.init_model super().train(dataset) + + with torch.no_grad(): + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + end_model = fft.rfft(concated) + change = end_model - self.init_model + if self.accumulation: + change += self.model.accumulated_changes + + self.model.accumulated_gradients.append(change) \ No newline at end of file diff --git a/src/decentralizepy/training/FrequencyWaveletAccumulator.py b/src/decentralizepy/training/FrequencyWaveletAccumulator.py index cf65724..ee36894 100644 --- a/src/decentralizepy/training/FrequencyWaveletAccumulator.py +++ b/src/decentralizepy/training/FrequencyWaveletAccumulator.py @@ -91,23 +91,31 @@ class FrequencyWaveletAccumulator(Training): """ # this looks at the change from the last round averaging of the frequencies - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - data = data.ravel() - if self.accumulation: - if self.model.accumulated_frequency is None: - logging.info("Initialize wavelet frequency accumulation") - self.model.accumulated_frequency = np.zeros_like( - data - ) # torch.zeros_like(data) - self.model.prev = data - else: - logging.info("wavelet frequency accumulation step") - self.model.accumulated_frequency += data - self.model.prev - self.model.prev = data - else: - logging.info("wavelet frequency accumulation reset") - self.model.accumulated_frequency = data + with torch.no_grad(): + self.model.accumulated_gradients = [] + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + self.init_model = torch.from_numpy(data.ravel()) + if self.accumulation: + if self.model.accumulated_changes is None: + self.model.accumulated_changes = torch.zeros_like(self.init_model) + self.prev = self.init_model + else: + self.model.accumulated_changes += 
(self.init_model - self.prev) + self.prev = self.init_model + super().train(dataset) + + with torch.no_grad(): + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + end_model = torch.from_numpy(data.ravel()) + change = end_model - self.init_model + if self.accumulation: + change += self.model.accumulated_changes + + self.model.accumulated_gradients.append(change) diff --git a/src/decentralizepy/training/ModelChangeAccumulator.py b/src/decentralizepy/training/ModelChangeAccumulator.py deleted file mode 100644 index 1c70283..0000000 --- a/src/decentralizepy/training/ModelChangeAccumulator.py +++ /dev/null @@ -1,103 +0,0 @@ -import logging - -import torch -from torch import fft - -from decentralizepy.training.Training import Training - - -class ModelChangeAccumulator(Training): - """ - This class implements the training module which also accumulates the model change at the beginning of a communication round. - - """ - - def __init__( - self, - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds="", - full_epochs="", - batch_size="", - shuffle="", - accumulation=True, - ): - """ - Constructor - - Parameters - ---------- - rank : int - Rank of process local to the machine - machine_id : int - Machine ID on which the process in running - mapping : decentralizepy.mappings - The object containing the mapping rank <--> uid - model : torch.nn.Module - Neural Network for training - optimizer : torch.optim - Optimizer to learn parameters - loss : function - Loss function - log_dir : str - Directory to log the model change. - rounds : int, optional - Number of steps/epochs per training call - full_epochs: bool, optional - True if 1 round = 1 epoch. False if 1 round = 1 minibatch - batch_size : int, optional - Number of items to learn over, in one batch - shuffle : bool - True if the dataset should be shuffled before training. - accumulation : bool - True if the model change should be accumulated across communication steps - - """ - super().__init__( - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds, - full_epochs, - batch_size, - shuffle, - ) - self.accumulation = accumulation - - def train(self, dataset): - """ - Does one training iteration. - If self.accumulation is True then it accumulates model parameter changes in model.prev_model_params. - Otherwise it stores the current model parameters in model.prev_model_params. - - Parameters - ---------- - dataset : decentralizepy.datasets.Dataset - The training dataset. 
Should implement get_trainset(batch_size, shuffle) - - """ - - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] - concated = torch.cat(tensors_to_cat, dim=0) - if self.accumulation: - if self.model.prev_model_params is None: - logging.info("Initialize model parameter accumulation.") - self.model.prev_model_params = torch.zeros_like(concated) - self.model.prev = concated - else: - logging.info("model parameter accumulation step") - self.model.prev_model_params += concated - self.model.prev - self.model.prev = concated - else: - logging.info("model parameter reset") - self.model.prev_model_params = concated - super().train(dataset) -- GitLab From 1a56aadc4a579595db0b6db4353ce760350421c6 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Mon, 14 Mar 2022 20:11:15 +0100 Subject: [PATCH 06/16] config fixes --- eval/step_configs/config_femnist.ini | 8 ++++---- eval/step_configs/config_femnist_topkacc.ini | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/eval/step_configs/config_femnist.ini b/eval/step_configs/config_femnist.ini index fa49fc9..4814b8a 100644 --- a/eval/step_configs/config_femnist.ini +++ b/eval/step_configs/config_femnist.ini @@ -3,8 +3,8 @@ dataset_package = decentralizepy.datasets.Femnist dataset_class = Femnist random_seed = 97 model_class = CNN -train_dir = /home/risharma/leaf/data/femnist/per_user_data/train -test_dir = /home/risharma/leaf/data/femnist/data/test +train_dir = /mnt/nfs/shared/leaf/data/femnist/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/femnist/data/test ; python list of fractions below sizes = @@ -16,9 +16,9 @@ lr = 0.001 [TRAIN_PARAMS] training_package = decentralizepy.training.GradientAccumulator training_class = GradientAccumulator -rounds = 20 +rounds = 47 full_epochs = False -batch_size = 64 +batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss diff --git a/eval/step_configs/config_femnist_topkacc.ini b/eval/step_configs/config_femnist_topkacc.ini index e65f225..805004b 100644 --- a/eval/step_configs/config_femnist_topkacc.ini +++ b/eval/step_configs/config_femnist_topkacc.ini @@ -31,6 +31,6 @@ comm_class = TCP addresses_filepath = ip_addr_6Machines.json [SHARING] -sharing_package = decentralizepy.sharing.TopK -sharing_class = TopK +sharing_package = decentralizepy.sharing.PartialModel +sharing_class = PartialModel alpha = 0.1 \ No newline at end of file -- GitLab From ed4148ea0d797637df4f5b2fe6bd7bf9fa1eb598 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Mon, 14 Mar 2022 20:33:49 +0100 Subject: [PATCH 07/16] reformatting --- eval/plot.py | 44 ++++++++++++++++--- src/decentralizepy/models/Model.py | 2 +- .../training/ChangeAccumulator.py | 4 +- .../training/FrequencyAccumulator.py | 12 +++-- .../training/FrequencyWaveletAccumulator.py | 10 +++-- 5 files changed, 55 insertions(+), 17 deletions(-) diff --git a/eval/plot.py b/eval/plot.py index 0b7b66c..f3f82c7 100644 --- a/eval/plot.py +++ b/eval/plot.py @@ -3,8 +3,8 @@ import os import sys import numpy as np -from matplotlib import pyplot as plt import pandas as pd +from matplotlib import pyplot as plt def get_stats(l): @@ -62,20 +62,50 @@ def plot_results(path): plt.figure(1) means, stdevs, mins, maxs = get_stats([x["train_loss"] for x in results]) plot(means, stdevs, mins, maxs, "Training Loss", folder, "upper right") - df = pd.DataFrame({"mean": list(means.values()), "std": list(stdevs.values()), "nr_nodes": [len(results)]*len(means)}, list(means.keys()), columns=["mean", "std", "nr_nodes"]) - 
df.to_csv(os.path.join(path, "train_loss_" + folder + ".csv")) + df = pd.DataFrame( + { + "mean": list(means.values()), + "std": list(stdevs.values()), + "nr_nodes": [len(results)] * len(means), + }, + list(means.keys()), + columns=["mean", "std", "nr_nodes"], + ) + df.to_csv( + os.path.join(path, "train_loss_" + folder + ".csv"), index_label="rounds" + ) # Plot Testing loss plt.figure(2) means, stdevs, mins, maxs = get_stats([x["test_loss"] for x in results]) plot(means, stdevs, mins, maxs, "Testing Loss", folder, "upper right") - df = pd.DataFrame({"mean": list(means.values()), "std": list(stdevs.values()), "nr_nodes": [len(results)]*len(means)}, list(means.keys()), columns=["mean", "std", "nr_nodes"]) - df.to_csv(os.path.join(path, "test_loss_" + folder + ".csv")) + df = pd.DataFrame( + { + "mean": list(means.values()), + "std": list(stdevs.values()), + "nr_nodes": [len(results)] * len(means), + }, + list(means.keys()), + columns=["mean", "std", "nr_nodes"], + ) + df.to_csv( + os.path.join(path, "test_loss_" + folder + ".csv"), index_label="rounds" + ) # Plot Testing Accuracy plt.figure(3) means, stdevs, mins, maxs = get_stats([x["test_acc"] for x in results]) plot(means, stdevs, mins, maxs, "Testing Accuracy", folder, "lower right") - df = pd.DataFrame({"mean": list(means.values()), "std": list(stdevs.values()), "nr_nodes": [len(results)]*len(means)}, list(means.keys()), columns=["mean", "std", "nr_nodes"]) - df.to_csv(os.path.join(path, "test_acc_" + folder + ".csv")) + df = pd.DataFrame( + { + "mean": list(means.values()), + "std": list(stdevs.values()), + "nr_nodes": [len(results)] * len(means), + }, + list(means.keys()), + columns=["mean", "std", "nr_nodes"], + ) + df.to_csv( + os.path.join(path, "test_acc_" + folder + ".csv"), index_label="rounds" + ) plt.figure(6) means, stdevs, mins, maxs = get_stats([x["grad_std"] for x in results]) plot( diff --git a/src/decentralizepy/models/Model.py b/src/decentralizepy/models/Model.py index 643eec5..1965608 100644 --- a/src/decentralizepy/models/Model.py +++ b/src/decentralizepy/models/Model.py @@ -56,4 +56,4 @@ class Model(nn.Module): """ if self.accumulated_changes is not None: - self.accumulated_changes[indices] = 0.0 \ No newline at end of file + self.accumulated_changes[indices] = 0.0 diff --git a/src/decentralizepy/training/ChangeAccumulator.py b/src/decentralizepy/training/ChangeAccumulator.py index c3bc81a..5e55621 100644 --- a/src/decentralizepy/training/ChangeAccumulator.py +++ b/src/decentralizepy/training/ChangeAccumulator.py @@ -167,7 +167,7 @@ class ChangeAccumulator(Training): else: flats = [v.data.flatten() for _, v in self.init_model.items()] flat = torch.cat(flats) - self.model.accumulated_changes += (flat - self.prev) + self.model.accumulated_changes += flat - self.prev self.prev = flat super().train(dataset) @@ -181,7 +181,7 @@ class ChangeAccumulator(Training): flat_change = torch.cat(flats_change) # flatten does not copy data if input is already flattened # however cat copies - change = {"flat" : self.model.accumulated_changes + flat_change} + change = {"flat": self.model.accumulated_changes + flat_change} self.model.accumulated_gradients.append(change) diff --git a/src/decentralizepy/training/FrequencyAccumulator.py b/src/decentralizepy/training/FrequencyAccumulator.py index 7d7c9ab..91e74b3 100644 --- a/src/decentralizepy/training/FrequencyAccumulator.py +++ b/src/decentralizepy/training/FrequencyAccumulator.py @@ -88,7 +88,9 @@ class FrequencyAccumulator(Training): """ with torch.no_grad(): 
self.model.accumulated_gradients = [] - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] concated = torch.cat(tensors_to_cat, dim=0) self.init_model = fft.rfft(concated) if self.accumulation: @@ -96,17 +98,19 @@ class FrequencyAccumulator(Training): self.model.accumulated_changes = torch.zeros_like(self.init_model) self.prev = self.init_model else: - self.model.accumulated_changes += (self.init_model - self.prev) + self.model.accumulated_changes += self.init_model - self.prev self.prev = self.init_model super().train(dataset) with torch.no_grad(): - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] concated = torch.cat(tensors_to_cat, dim=0) end_model = fft.rfft(concated) change = end_model - self.init_model if self.accumulation: change += self.model.accumulated_changes - self.model.accumulated_gradients.append(change) \ No newline at end of file + self.model.accumulated_gradients.append(change) diff --git a/src/decentralizepy/training/FrequencyWaveletAccumulator.py b/src/decentralizepy/training/FrequencyWaveletAccumulator.py index ee36894..54238ab 100644 --- a/src/decentralizepy/training/FrequencyWaveletAccumulator.py +++ b/src/decentralizepy/training/FrequencyWaveletAccumulator.py @@ -93,7 +93,9 @@ class FrequencyWaveletAccumulator(Training): # this looks at the change from the last round averaging of the frequencies with torch.no_grad(): self.model.accumulated_gradients = [] - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] concated = torch.cat(tensors_to_cat, dim=0) coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) data, coeff_slices = pywt.coeffs_to_array(coeff) @@ -103,13 +105,15 @@ class FrequencyWaveletAccumulator(Training): self.model.accumulated_changes = torch.zeros_like(self.init_model) self.prev = self.init_model else: - self.model.accumulated_changes += (self.init_model - self.prev) + self.model.accumulated_changes += self.init_model - self.prev self.prev = self.init_model super().train(dataset) with torch.no_grad(): - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] concated = torch.cat(tensors_to_cat, dim=0) coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) data, coeff_slices = pywt.coeffs_to_array(coeff) -- GitLab From 573e433159fa7c0718fa934d597330a650a43db8 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Tue, 15 Mar 2022 22:28:13 +0100 Subject: [PATCH 08/16] fft and wavelet moving everything to sharing --- eval/step_configs/config_femnist_fft.ini | 8 +- eval/step_configs/config_femnist_wavelet.ini | 8 +- src/decentralizepy/sharing/FFT.py | 59 +++++++-- src/decentralizepy/sharing/Wavelet.py | 74 ++++++++--- .../training/FrequencyAccumulator.py | 116 ---------------- .../training/FrequencyWaveletAccumulator.py | 125 ------------------ 6 files changed, 111 insertions(+), 279 deletions(-) delete mode 100644 src/decentralizepy/training/FrequencyAccumulator.py delete mode 100644 src/decentralizepy/training/FrequencyWaveletAccumulator.py diff --git a/eval/step_configs/config_femnist_fft.ini b/eval/step_configs/config_femnist_fft.ini index 13a769c..afac1f4 100644 --- 
a/eval/step_configs/config_femnist_fft.ini +++ b/eval/step_configs/config_femnist_fft.ini @@ -15,15 +15,14 @@ lr = 0.001 # There are 734463 femnist samples [TRAIN_PARAMS] -training_package = decentralizepy.training.FrequencyAccumulator -training_class = FrequencyAccumulator +training_package = decentralizepy.training.Training +training_class = Training rounds = 47 full_epochs = False batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss -accumulation = True [COMMUNICATION] comm_package = decentralizepy.communication.TCP @@ -34,4 +33,5 @@ addresses_filepath = ip_addr_6Machines.json sharing_package = decentralizepy.sharing.FFT sharing_class = FFT alpha = 0.1 -change_based_selection = True \ No newline at end of file +change_based_selection = True +accumulation = True diff --git a/eval/step_configs/config_femnist_wavelet.ini b/eval/step_configs/config_femnist_wavelet.ini index ac3bac2..68704a3 100644 --- a/eval/step_configs/config_femnist_wavelet.ini +++ b/eval/step_configs/config_femnist_wavelet.ini @@ -15,17 +15,14 @@ lr = 0.001 # There are 734463 femnist samples [TRAIN_PARAMS] -training_package = decentralizepy.training.FrequencyWaveletAccumulator -training_class = FrequencyWaveletAccumulator +training_package = decentralizepy.training.Training +training_class = Training rounds = 47 full_epochs = False batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss -wavelet=sym2 -level= None -accumulation = True [COMMUNICATION] comm_package = decentralizepy.communication.TCP @@ -39,3 +36,4 @@ change_based_selection = True alpha = 0.1 wavelet=sym2 level= None +accumulation = True diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index a4c3b59..80b5a5d 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -99,6 +99,23 @@ class FFT(Sharing): self.change_based_selection = change_based_selection self.accumulation = accumulation + # getting the initial model + with torch.no_grad(): + self.model.accumulated_gradients = [] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + concated = torch.cat(tensors_to_cat, dim=0) + self.init_model = fft.rfft(concated) + self.prev = None + if self.accumulation: + if self.model.accumulated_changes is None: + self.model.accumulated_changes = torch.zeros_like(self.init_model) + self.prev = self.init_model + else: + self.model.accumulated_changes += self.init_model - self.prev + self.prev = self.init_model + def apply_fft(self): """ Does fft transformation of the model parameters and selects topK (alpha) of them in the frequency domain @@ -225,6 +242,25 @@ class FFT(Sharing): """ t_start = time() + shapes = [] + lens = [] + end_model = None + change = 0 + self.model.accumulated_gradients = [] + with torch.no_grad(): + # FFT of this model + tensors_to_cat = [] + for _, v in self.model.state_dict().items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + concated = torch.cat(tensors_to_cat, dim=0) + end_model = fft.rfft(concated) + change = end_model - self.init_model + if self.accumulation: + change += self.model.accumulated_changes + self.model.accumulated_gradients.append(change) data = self.serialized_model() t_post_serialize = time() my_uid = self.mapping.get_uid(self.rank, self.machine_id) @@ -255,17 +291,7 @@ class FFT(Sharing): total = None weight_total = 0 - # FFT of this model - shapes = [] - lens = [] - tensors_to_cat = [] - for _, v in 
self.model.state_dict().items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - concated = torch.cat(tensors_to_cat, dim=0) - flat_fft = fft.rfft(concated) + flat_fft = end_model for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() @@ -303,6 +329,17 @@ class FFT(Sharing): self.communication_round += 1 + with torch.no_grad(): + self.model.accumulated_gradients = [] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + concated = torch.cat(tensors_to_cat, dim=0) + self.init_model = fft.rfft(concated) + if self.accumulation: + self.model.accumulated_changes += self.init_model - self.prev + self.prev = self.init_model + t_end = time() logging.info( diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index 2ec700a..2d651b0 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -106,6 +106,26 @@ class Wavelet(Sharing): Path(self.folder_path).mkdir(parents=True, exist_ok=True) self.change_based_selection = change_based_selection + self.accumulation = accumulation + + # getting the initial model + with torch.no_grad(): + self.model.accumulated_gradients = [] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + concated = torch.cat(tensors_to_cat, dim=0) + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + self.init_model = torch.from_numpy(data.ravel()) + self.prev = None + if self.accumulation: + if self.model.accumulated_changes is None: + self.model.accumulated_changes = torch.zeros_like(self.init_model) + self.prev = self.init_model + else: + self.model.accumulated_changes += self.init_model - self.prev + self.prev = self.init_model def apply_wavelet(self): """ @@ -257,6 +277,29 @@ class Wavelet(Sharing): """ t_start = time() + shapes = [] + lens = [] + end_model = None + change = 0 + self.model.accumulated_gradients = [] + with torch.no_grad(): + # FFT of this model + tensors_to_cat = [] + for _, v in self.model.state_dict().items(): + shapes.append(v.shape) + t = v.flatten() + lens.append(t.shape[0]) + tensors_to_cat.append(t) + concated = torch.cat(tensors_to_cat, dim=0) + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + shape = data.shape + wt_params = data.ravel() + end_model = torch.from_numpy(wt_params) + change = end_model - self.init_model + if self.accumulation: + change += self.model.accumulated_changes + self.model.accumulated_gradients.append(change) data = self.serialized_model() t_post_serialize = time() my_uid = self.mapping.get_uid(self.rank, self.machine_id) @@ -287,24 +330,6 @@ class Wavelet(Sharing): total = None weight_total = 0 - # FFT of this model - shapes = [] - lens = [] - tensors_to_cat = [] - # TODO: should we detach - for _, v in self.model.state_dict().items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - wt_params, coeff_slices = pywt.coeffs_to_array( - coeff - ) # coeff_slices will be reproduced on the receiver - shape = wt_params.shape - wt_params = wt_params.ravel() - for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() logging.debug( @@ -348,6 +373,19 @@ 
class Wavelet(Sharing): self.communication_round += 1 + with torch.no_grad(): + self.model.accumulated_gradients = [] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + concated = torch.cat(tensors_to_cat, dim=0) + coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + self.init_model = torch.from_numpy(data.ravel()) + if self.accumulation: + self.model.accumulated_changes += self.init_model - self.prev + self.prev = self.init_model + t_end = time() logging.info( diff --git a/src/decentralizepy/training/FrequencyAccumulator.py b/src/decentralizepy/training/FrequencyAccumulator.py deleted file mode 100644 index 91e74b3..0000000 --- a/src/decentralizepy/training/FrequencyAccumulator.py +++ /dev/null @@ -1,116 +0,0 @@ -import logging - -import torch -from torch import fft - -from decentralizepy.training.Training import Training - - -class FrequencyAccumulator(Training): - """ - This class implements the training module which also accumulates the fft frequency at the beginning of steps a communication round. - - """ - - def __init__( - self, - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds="", - full_epochs="", - batch_size="", - shuffle="", - accumulation=True, - ): - """ - Constructor - - Parameters - ---------- - rank : int - Rank of process local to the machine - machine_id : int - Machine ID on which the process in running - mapping : decentralizepy.mappings - The object containing the mapping rank <--> uid - model : torch.nn.Module - Neural Network for training - optimizer : torch.optim - Optimizer to learn parameters - loss : function - Loss function - log_dir : str - Directory to log the model change. - rounds : int, optional - Number of steps/epochs per training call - full_epochs: bool, optional - True if 1 round = 1 epoch. False if 1 round = 1 minibatch - batch_size : int, optional - Number of items to learn over, in one batch - shuffle : bool - True if the dataset should be shuffled before training. - accumulation : bool - True if the model change should be accumulated across communication steps - """ - super().__init__( - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds, - full_epochs, - batch_size, - shuffle, - ) - self.accumulation = accumulation - self.init_model = None - self.prev = None - - def train(self, dataset): - """ - Does one training iteration. - If self.accumulation is True then it accumulates model fft frequency changes in model.accumulated_frequency. - Otherwise it stores the current fft frequency representation of the model in model.accumulated_frequency. - - Parameters - ---------- - dataset : decentralizepy.datasets.Dataset - The training dataset. 
Should implement get_trainset(batch_size, shuffle) - - """ - with torch.no_grad(): - self.model.accumulated_gradients = [] - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - self.init_model = fft.rfft(concated) - if self.accumulation: - if self.model.accumulated_changes is None: - self.model.accumulated_changes = torch.zeros_like(self.init_model) - self.prev = self.init_model - else: - self.model.accumulated_changes += self.init_model - self.prev - self.prev = self.init_model - - super().train(dataset) - - with torch.no_grad(): - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - end_model = fft.rfft(concated) - change = end_model - self.init_model - if self.accumulation: - change += self.model.accumulated_changes - - self.model.accumulated_gradients.append(change) diff --git a/src/decentralizepy/training/FrequencyWaveletAccumulator.py b/src/decentralizepy/training/FrequencyWaveletAccumulator.py deleted file mode 100644 index 54238ab..0000000 --- a/src/decentralizepy/training/FrequencyWaveletAccumulator.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging - -import numpy as np -import pywt -import torch - -from decentralizepy.training.Training import Training - - -class FrequencyWaveletAccumulator(Training): - """ - This class implements the training module which also accumulates the wavelet frequency at the beginning of steps a communication round. - - """ - - def __init__( - self, - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds="", - full_epochs="", - batch_size="", - shuffle="", - wavelet="haar", - level=4, - accumulation=True, - ): - """ - Constructor - - Parameters - ---------- - rank : int - Rank of process local to the machine - machine_id : int - Machine ID on which the process in running - mapping : decentralizepy.mappings - The object containing the mapping rank <--> uid - model : torch.nn.Module - Neural Network for training - optimizer : torch.optim - Optimizer to learn parameters - loss : function - Loss function - log_dir : str - Directory to log the model change. - rounds : int, optional - Number of steps/epochs per training call - full_epochs: bool, optional - True if 1 round = 1 epoch. False if 1 round = 1 minibatch - batch_size : int, optional - Number of items to learn over, in one batch - shuffle : bool - True if the dataset should be shuffled before training. - accumulation : bool - True if the model change should be accumulated across communication steps - """ - super().__init__( - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds, - full_epochs, - batch_size, - shuffle, - ) - self.wavelet = wavelet - self.level = level - self.accumulation = accumulation - - def train(self, dataset): - """ - Does one training iteration. - If self.accumulation is True then it accumulates model wavelet frequency changes in model.accumulated_frequency. - Otherwise it stores the current wavelet frequency representation of the model in model.accumulated_frequency. - - Parameters - ---------- - dataset : decentralizepy.datasets.Dataset - The training dataset. 
Should implement get_trainset(batch_size, shuffle) - - """ - - # this looks at the change from the last round averaging of the frequencies - with torch.no_grad(): - self.model.accumulated_gradients = [] - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - self.init_model = torch.from_numpy(data.ravel()) - if self.accumulation: - if self.model.accumulated_changes is None: - self.model.accumulated_changes = torch.zeros_like(self.init_model) - self.prev = self.init_model - else: - self.model.accumulated_changes += self.init_model - self.prev - self.prev = self.init_model - - super().train(dataset) - - with torch.no_grad(): - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - end_model = torch.from_numpy(data.ravel()) - change = end_model - self.init_model - if self.accumulation: - change += self.model.accumulated_changes - - self.model.accumulated_gradients.append(change) -- GitLab From 72a9f1a67db82f98cee949296be3d7b34f612970 Mon Sep 17 00:00:00 2001 From: Rishi Sharma Date: Wed, 16 Mar 2022 13:49:40 +0000 Subject: [PATCH 09/16] Remove misleading comment --- eval/testing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eval/testing.py b/eval/testing.py index bb16c2f..abd6333 100644 --- a/eval/testing.py +++ b/eval/testing.py @@ -24,7 +24,6 @@ def read_ini(file_path): if __name__ == "__main__": args = utils.get_args() - # prevents accidental log overwrites Path(args.log_dir).mkdir(parents=True, exist_ok=True) log_level = { -- GitLab From 0c2b8989bf17797d94d867176a398066c60ccf67 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Thu, 17 Mar 2022 12:25:33 +0100 Subject: [PATCH 10/16] fixed from_torch bug in wavelet; fixed circular dependency deadlock in fft and wavelet; added run_all.sh run_all.sh and more config files crudini moving everything to sharing --- eval/run_all.sh | 39 +++ eval/step_configs/config_celeba.ini | 14 +- eval/step_configs/config_celeba_100.ini | 10 +- eval/step_configs/config_celeba_fft.ini | 36 +++ eval/step_configs/config_celeba_grow.ini | 14 +- .../config_celeba_manualadapt.ini | 35 +++ .../config_celeba_randomalpha.ini | 33 +++ .../config_celeba_randomalphainc.ini | 33 +++ .../step_configs/config_celeba_roundrobin.ini | 34 +++ .../config_celeba_subsampling.ini | 34 +++ eval/step_configs/config_celeba_topkacc.ini | 35 +++ eval/step_configs/config_celeba_topkparam.ini | 34 +++ .../step_configs/config_celeba_topkrandom.ini | 34 +++ eval/step_configs/config_celeba_wavelet.ini | 38 +++ eval/step_configs/config_femnist.ini | 4 +- eval/step_configs/config_femnist_grow.ini | 4 +- eval/step_configs/config_femnist_topkacc.ini | 4 +- eval/testing.py | 1 + setup.cfg | 1 + src/decentralizepy/models/Model.py | 2 +- src/decentralizepy/node/Node.py | 1 + src/decentralizepy/sharing/FFT.py | 238 +++++---------- src/decentralizepy/sharing/PartialModel.py | 127 +++++++- src/decentralizepy/sharing/Sharing.py | 67 +++-- src/decentralizepy/sharing/TopKPlusRandom.py | 10 +- src/decentralizepy/sharing/Wavelet.py | 273 ++++++------------ .../training/GradientAccumulator.py | 112 ------- src/decentralizepy/utils.py | 14 + 28 files changed, 747 insertions(+), 534 deletions(-) create 
mode 100755 eval/run_all.sh create mode 100644 eval/step_configs/config_celeba_fft.ini create mode 100644 eval/step_configs/config_celeba_manualadapt.ini create mode 100644 eval/step_configs/config_celeba_randomalpha.ini create mode 100644 eval/step_configs/config_celeba_randomalphainc.ini create mode 100644 eval/step_configs/config_celeba_roundrobin.ini create mode 100644 eval/step_configs/config_celeba_subsampling.ini create mode 100644 eval/step_configs/config_celeba_topkacc.ini create mode 100644 eval/step_configs/config_celeba_topkparam.ini create mode 100644 eval/step_configs/config_celeba_topkrandom.ini create mode 100644 eval/step_configs/config_celeba_wavelet.ini delete mode 100644 src/decentralizepy/training/GradientAccumulator.py diff --git a/eval/run_all.sh b/eval/run_all.sh new file mode 100755 index 0000000..1afdf02 --- /dev/null +++ b/eval/run_all.sh @@ -0,0 +1,39 @@ +#!/bin/bash +nfs_home=$1 +python_bin=$2 +decpy_path=$nfs_home/decentralizepy/eval +cd $decpy_path + +env_python=$python_bin/python3 +graph=96_regular.edges #4_node_fullyConnected.edges +config_file=~/tmp/config.ini +procs_per_machine=16 +machines=6 +iterations=5 +test_after=21 # we do not test +eval_file=testing.py +log_level=INFO + +ip_machines=$nfs_home/configs/ip_addr_6Machines.json + +m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` +export PYTHONFAULTHANDLER=1 +tests=("step_configs/config_celeba.ini" "step_configs/config_celeba_100.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini" +"step_configs/config_celeba_grow.ini" "step_configs/config_celeba_manualadapt.ini" "step_configs/config_celeba_randomalpha.ini" +"step_configs/config_celeba_randomalphainc.ini" "step_configs/config_celeba_roundrobin.ini" "step_configs/config_celeba_subsampling.ini" +"step_configs/config_celeba_topkrandom.ini" "step_configs/config_celeba_topkacc.ini" "step_configs/config_celeba_topkparam.ini") + +for i in "${tests[@]}" +do + echo $i + IFS='_' read -ra NAMES <<< $i + IFS='.' 
read -ra NAME <<< ${NAMES[-1]} + log_dir=$nfs_home/logs/testing/${NAME[0]}$(date '+%Y-%m-%dT%H:%M')/machine$m + mkdir -p $log_dir + cp $i $config_file + $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines + $env_python $eval_file -ro 0 -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level + echo $i is done + sleep 3 + echo end of sleep +done diff --git a/eval/step_configs/config_celeba.ini b/eval/step_configs/config_celeba.ini index 5cadf01..6c9a4b5 100644 --- a/eval/step_configs/config_celeba.ini +++ b/eval/step_configs/config_celeba.ini @@ -2,9 +2,9 @@ dataset_package = decentralizepy.datasets.Celeba dataset_class = Celeba model_class = CNN -images_dir = /home/risharma/leaf/data/celeba/data/raw/img_align_celeba -train_dir = /home/risharma/leaf/data/celeba/per_user_data/train -test_dir = /home/risharma/leaf/data/celeba/data/test +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test ; python list of fractions below sizes = @@ -14,11 +14,11 @@ optimizer_class = Adam lr = 0.001 [TRAIN_PARAMS] -training_package = decentralizepy.training.GradientAccumulator -training_class = GradientAccumulator -rounds = 20 +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 full_epochs = False -batch_size = 64 +batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss diff --git a/eval/step_configs/config_celeba_100.ini b/eval/step_configs/config_celeba_100.ini index 70e14bb..caf05fa 100644 --- a/eval/step_configs/config_celeba_100.ini +++ b/eval/step_configs/config_celeba_100.ini @@ -2,9 +2,9 @@ dataset_package = decentralizepy.datasets.Celeba dataset_class = Celeba model_class = CNN -images_dir = /home/risharma/leaf/data/celeba/data/raw/img_align_celeba -train_dir = /home/risharma/leaf/data/celeba/per_user_data/train -test_dir = /home/risharma/leaf/data/celeba/data/test +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test ; python list of fractions below sizes = @@ -16,9 +16,9 @@ lr = 0.001 [TRAIN_PARAMS] training_package = decentralizepy.training.Training training_class = Training -rounds = 20 +rounds = 4 full_epochs = False -batch_size = 64 +batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss diff --git a/eval/step_configs/config_celeba_fft.ini b/eval/step_configs/config_celeba_fft.ini new file mode 100644 index 0000000..e8d6a70 --- /dev/null +++ b/eval/step_configs/config_celeba_fft.ini @@ -0,0 +1,36 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class 
= TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.FFT +sharing_class = FFT +alpha = 0.1 +change_based_selection = True +accumulation = True diff --git a/eval/step_configs/config_celeba_grow.ini b/eval/step_configs/config_celeba_grow.ini index be0812e..37e74ae 100644 --- a/eval/step_configs/config_celeba_grow.ini +++ b/eval/step_configs/config_celeba_grow.ini @@ -2,9 +2,9 @@ dataset_package = decentralizepy.datasets.Celeba dataset_class = Celeba model_class = CNN -images_dir = /home/risharma/leaf/data/celeba/data/raw/img_align_celeba -train_dir = /home/risharma/leaf/data/celeba/per_user_data/train -test_dir = /home/risharma/leaf/data/celeba/data/test +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test ; python list of fractions below sizes = @@ -14,11 +14,11 @@ optimizer_class = Adam lr = 0.001 [TRAIN_PARAMS] -training_package = decentralizepy.training.GradientAccumulator -training_class = GradientAccumulator -rounds = 20 +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 full_epochs = False -batch_size = 64 +batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss diff --git a/eval/step_configs/config_celeba_manualadapt.ini b/eval/step_configs/config_celeba_manualadapt.ini new file mode 100644 index 0000000..1c117e2 --- /dev/null +++ b/eval/step_configs/config_celeba_manualadapt.ini @@ -0,0 +1,35 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.ManualAdapt +sharing_class = ManualAdapt +change_alpha = [0.1, 0.5] +change_rounds = [10,30] diff --git a/eval/step_configs/config_celeba_randomalpha.ini b/eval/step_configs/config_celeba_randomalpha.ini new file mode 100644 index 0000000..1c4b989 --- /dev/null +++ b/eval/step_configs/config_celeba_randomalpha.ini @@ -0,0 +1,33 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] 
+sharing_package = decentralizepy.sharing.RandomAlpha +sharing_class = RandomAlpha diff --git a/eval/step_configs/config_celeba_randomalphainc.ini b/eval/step_configs/config_celeba_randomalphainc.ini new file mode 100644 index 0000000..5171b64 --- /dev/null +++ b/eval/step_configs/config_celeba_randomalphainc.ini @@ -0,0 +1,33 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.RandomAlphaIncremental +sharing_class = RandomAlphaIncremental diff --git a/eval/step_configs/config_celeba_roundrobin.ini b/eval/step_configs/config_celeba_roundrobin.ini new file mode 100644 index 0000000..3dadf32 --- /dev/null +++ b/eval/step_configs/config_celeba_roundrobin.ini @@ -0,0 +1,34 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.RoundRobinPartial +sharing_class = RoundRobinPartial +alpha = 0.1 diff --git a/eval/step_configs/config_celeba_subsampling.ini b/eval/step_configs/config_celeba_subsampling.ini new file mode 100644 index 0000000..b806898 --- /dev/null +++ b/eval/step_configs/config_celeba_subsampling.ini @@ -0,0 +1,34 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.SubSampling +sharing_class = SubSampling +alpha = 0.1 diff --git 
a/eval/step_configs/config_celeba_topkacc.ini b/eval/step_configs/config_celeba_topkacc.ini new file mode 100644 index 0000000..89eef29 --- /dev/null +++ b/eval/step_configs/config_celeba_topkacc.ini @@ -0,0 +1,35 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.PartialModel +sharing_class = PartialModel +alpha = 0.1 +accumulation = True diff --git a/eval/step_configs/config_celeba_topkparam.ini b/eval/step_configs/config_celeba_topkparam.ini new file mode 100644 index 0000000..babc3e9 --- /dev/null +++ b/eval/step_configs/config_celeba_topkparam.ini @@ -0,0 +1,34 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.TopKParams +sharing_class = TopKParams +alpha = 0.1 diff --git a/eval/step_configs/config_celeba_topkrandom.ini b/eval/step_configs/config_celeba_topkrandom.ini new file mode 100644 index 0000000..7674955 --- /dev/null +++ b/eval/step_configs/config_celeba_topkrandom.ini @@ -0,0 +1,34 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.TopKPlusRandom +sharing_class = TopKPlusRandom +alpha = 0.1 diff --git a/eval/step_configs/config_celeba_wavelet.ini b/eval/step_configs/config_celeba_wavelet.ini new file mode 100644 index 0000000..70e9f15 --- /dev/null 
+++ b/eval/step_configs/config_celeba_wavelet.ini @@ -0,0 +1,38 @@ +[DATASET] +dataset_package = decentralizepy.datasets.Celeba +dataset_class = Celeba +model_class = CNN +images_dir = /mnt/nfs/shared/leaf/data/celeba/data/raw/img_align_celeba +train_dir = /mnt/nfs/shared/leaf/data/celeba/per_user_data/train +test_dir = /mnt/nfs/shared/leaf/data/celeba/data/test +; python list of fractions below +sizes = + +[OPTIMIZER_PARAMS] +optimizer_package = torch.optim +optimizer_class = Adam +lr = 0.001 + +[TRAIN_PARAMS] +training_package = decentralizepy.training.Training +training_class = Training +rounds = 4 +full_epochs = False +batch_size = 16 +shuffle = True +loss_package = torch.nn +loss_class = CrossEntropyLoss + +[COMMUNICATION] +comm_package = decentralizepy.communication.TCP +comm_class = TCP +addresses_filepath = ip_addr_6Machines.json + +[SHARING] +sharing_package = decentralizepy.sharing.Wavelet +sharing_class = Wavelet +change_based_selection = True +alpha = 0.1 +wavelet=sym2 +level= None +accumulation = True diff --git a/eval/step_configs/config_femnist.ini b/eval/step_configs/config_femnist.ini index 4814b8a..8063181 100644 --- a/eval/step_configs/config_femnist.ini +++ b/eval/step_configs/config_femnist.ini @@ -14,8 +14,8 @@ optimizer_class = Adam lr = 0.001 [TRAIN_PARAMS] -training_package = decentralizepy.training.GradientAccumulator -training_class = GradientAccumulator +training_package = decentralizepy.training.Training +training_class = Training rounds = 47 full_epochs = False batch_size = 16 diff --git a/eval/step_configs/config_femnist_grow.ini b/eval/step_configs/config_femnist_grow.ini index 9f18ad9..2a779c4 100644 --- a/eval/step_configs/config_femnist_grow.ini +++ b/eval/step_configs/config_femnist_grow.ini @@ -13,8 +13,8 @@ optimizer_class = Adam lr = 0.001 [TRAIN_PARAMS] -training_package = decentralizepy.training.GradientAccumulator -training_class = GradientAccumulator +training_package = decentralizepy.training.Training +training_class = Training rounds = 20 full_epochs = False batch_size = 64 diff --git a/eval/step_configs/config_femnist_topkacc.ini b/eval/step_configs/config_femnist_topkacc.ini index 805004b..2705fe7 100644 --- a/eval/step_configs/config_femnist_topkacc.ini +++ b/eval/step_configs/config_femnist_topkacc.ini @@ -23,7 +23,6 @@ batch_size = 16 shuffle = True loss_package = torch.nn loss_class = CrossEntropyLoss -accumulation = True [COMMUNICATION] comm_package = decentralizepy.communication.TCP @@ -33,4 +32,5 @@ addresses_filepath = ip_addr_6Machines.json [SHARING] sharing_package = decentralizepy.sharing.PartialModel sharing_class = PartialModel -alpha = 0.1 \ No newline at end of file +alpha = 0.1 +accumulation = True \ No newline at end of file diff --git a/eval/testing.py b/eval/testing.py index abd6333..b9c4081 100644 --- a/eval/testing.py +++ b/eval/testing.py @@ -65,3 +65,4 @@ if __name__ == "__main__": args.reset_optimizer, ], ) + print("after spawn") diff --git a/setup.cfg b/setup.cfg index 0b85f72..e174dd4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,6 +44,7 @@ install_requires = localconfig PyWavelets pandas + crudini include_package_data = True python_requires = >=3.6 [options.packages.find] diff --git a/src/decentralizepy/models/Model.py b/src/decentralizepy/models/Model.py index 1965608..f3635a9 100644 --- a/src/decentralizepy/models/Model.py +++ b/src/decentralizepy/models/Model.py @@ -14,7 +14,7 @@ class Model(nn.Module): """ super().__init__() - self.accumulated_gradients = [] + self.model_change = None self._param_count_ot = None 
self._param_count_total = None self.accumulated_changes = None diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py index 7854c38..463f57f 100644 --- a/src/decentralizepy/node/Node.py +++ b/src/decentralizepy/node/Node.py @@ -481,3 +481,4 @@ class Node: ) self.run() + logging.info("Node finished running") diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index 80b5a5d..1bc7e0e 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -8,10 +8,26 @@ import numpy as np import torch import torch.fft as fft -from decentralizepy.sharing.Sharing import Sharing +from decentralizepy.sharing.PartialModel import PartialModel -class FFT(Sharing): +def change_transformer_fft(x): + """ + Transforms the model changes into the frequency domain + + Parameters + ---------- + x : torch.Tensor + Model change in the space domain + + Returns + ------- + x : torch.Tensor + Representation of the change in the frequency domain + """ + return fft.rfft(x) + +class FFT(PartialModel): """ This class implements the fft version of model sharing It is based on PartialModel.py @@ -32,8 +48,8 @@ class FFT(Sharing): dict_ordered=True, save_shared=False, metadata_cap=1.0, - pickle=True, change_based_selection=True, + save_accumulated="", accumulation=True, ): """ @@ -65,56 +81,19 @@ class FFT(Sharing): Specifies if the indices of shared parameters should be logged metadata_cap : float Share full model when self.alpha > metadata_cap - pickle : bool - use pickle to serialize the model parameters change_based_selection : bool use frequency change to select topk frequencies + save_accumulated : bool + True if accumulated weight change in the frequency domain should be written to file. In case of accumulation + the accumulated change is stored.
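A minimal sketch of the flatten, rfft, irfft, reshape cycle that change_transformer_fft and the FFT averaging path rely on; the toy two-tensor state_dict and all names here are illustrative stand-ins, not code from this patch:

import torch
import torch.fft as fft

# Toy stand-in for a model state_dict; shapes are arbitrary
state_dict = {"fc.weight": torch.randn(4, 3), "fc.bias": torch.randn(4)}
shapes = [v.shape for v in state_dict.values()]
lens = [v.numel() for v in state_dict.values()]

flat = torch.cat([v.flatten() for v in state_dict.values()])
freq = fft.rfft(flat)                       # forward transform, as in change_transformer_fft
restored = fft.irfft(freq, n=flat.numel())  # inverse transform applied after averaging

assert torch.allclose(flat, restored, atol=1e-5)

# Rebuild per-parameter tensors from the flat vector (cf. self.shapes / self.lens)
start, rebuilt = 0, {}
for (key, _), shape, length in zip(state_dict.items(), shapes, lens):
    rebuilt[key] = restored[start : start + length].reshape(shape)
    start += length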
accumulation : bool True if the indices to share should be selected based on accumulated frequency change """ super().__init__( - rank, machine_id, communication, mapping, graph, model, dataset, log_dir + rank, machine_id, communication, mapping, graph, model, dataset, log_dir, alpha, dict_ordered, save_shared, + metadata_cap, accumulation, save_accumulated, change_transformer_fft ) - self.alpha = alpha - self.dict_ordered = dict_ordered - self.save_shared = save_shared - self.metadata_cap = metadata_cap - self.total_meta = 0 - - self.pickle = pickle - - logging.info("subsampling pickling=" + str(pickle)) - - if self.save_shared: - # Only save for 2 procs: Save space - if rank != 0 or rank != 1: - self.save_shared = False - - if self.save_shared: - self.folder_path = os.path.join( - self.log_dir, "shared_params/{}".format(self.rank) - ) - Path(self.folder_path).mkdir(parents=True, exist_ok=True) self.change_based_selection = change_based_selection - self.accumulation = accumulation - - # getting the initial model - with torch.no_grad(): - self.model.accumulated_gradients = [] - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - self.init_model = fft.rfft(concated) - self.prev = None - if self.accumulation: - if self.model.accumulated_changes is None: - self.model.accumulated_changes = torch.zeros_like(self.init_model) - self.prev = self.init_model - else: - self.model.accumulated_changes += self.init_model - self.prev - self.prev = self.init_model def apply_fft(self): """ @@ -129,20 +108,19 @@ class FFT(Sharing): """ logging.info("Returning fft compressed model weights") - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] - concated = torch.cat(tensors_to_cat, dim=0) - flat_fft = fft.rfft(concated) - if self.change_based_selection: - - assert len(self.model.accumulated_gradients) == 1 - diff = self.model.accumulated_gradients[0] - _, index = torch.topk( - diff.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False - ) - else: - _, index = torch.topk( - flat_fft.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False - ) + with torch.no_grad(): + tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + concated = torch.cat(tensors_to_cat, dim=0) + flat_fft = self.change_transformer(concated) + if self.change_based_selection: + diff = self.model.model_change + _, index = torch.topk( + diff.abs(), round(self.alpha * len(diff)), dim=0, sorted=False + ) + else: + _, index = torch.topk( + flat_fft.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False + ) return flat_fft[index], index @@ -199,7 +177,7 @@ class FFT(Sharing): self.communication.encrypt(m["alpha"]) ) - return m + return m def deserialized_model(self, m): """ @@ -220,8 +198,6 @@ class FFT(Sharing): return super().deserialized_model(m) with torch.no_grad(): - state_dict = self.model.state_dict() - if not self.dict_ordered: raise NotImplementedError @@ -234,119 +210,47 @@ class FFT(Sharing): ret = dict() ret["indices"] = indices_tensor ret["params"] = params_tensor - return ret + return ret - def step(self): + def _averaging(self): """ - Perform a sharing step. Implements D-PSGD.
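The alpha-fraction selection in apply_fft reduces to a single torch.topk over coefficient magnitudes; a small illustration with made-up sizes, where change stands in for self.model.model_change:

import torch

alpha = 0.1
change = torch.randn(100, dtype=torch.complex64)  # accumulated change in the frequency domain

# Keep the round(alpha * n) coefficients of largest magnitude; only the
# (values, index) pairs are serialized and sent to neighbors.
_, index = torch.topk(change.abs(), round(alpha * len(change)), dim=0, sorted=False)
values = change[index]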
+ Averages the received model with the local model """ - t_start = time() - shapes = [] - lens = [] - end_model = None - change = 0 - self.model.accumulated_gradients = [] with torch.no_grad(): - # FFT of this model - tensors_to_cat = [] - for _, v in self.model.state_dict().items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - concated = torch.cat(tensors_to_cat, dim=0) - end_model = fft.rfft(concated) - change = end_model - self.init_model - if self.accumulation: - change += self.model.accumulated_changes - self.model.accumulated_gradients.append(change) - data = self.serialized_model() - t_post_serialize = time() - my_uid = self.mapping.get_uid(self.rank, self.machine_id) - all_neighbors = self.graph.neighbors(my_uid) - iter_neighbors = self.get_neighbors(all_neighbors) - data["degree"] = len(all_neighbors) - data["iteration"] = self.communication_round - for neighbor in iter_neighbors: - self.communication.send(neighbor, data) - t_post_send = time() - logging.info("Waiting for messages from neighbors") - while not self.received_from_all(): - sender, data = self.communication.receive() - logging.debug("Received model from {}".format(sender)) - degree = data["degree"] - iteration = data["iteration"] - del data["degree"] - del data["iteration"] - self.peer_deques[sender].append((degree, iteration, data)) - logging.info( - "Deserialized received model from {} of iteration {}".format( - sender, iteration - ) - ) - t_post_recv = time() + total = None + weight_total = 0 - logging.info("Starting model averaging after receiving from all neighbors") - total = None - weight_total = 0 + flat_fft = self.change_transformer(self.init_model) - flat_fft = end_model - - for i, n in enumerate(self.peer_deques): - degree, iteration, data = self.peer_deques[n].popleft() - logging.debug( - "Averaging model from neighbor {} of iteration {}".format(n, iteration) - ) - data = self.deserialized_model(data) - params = data["params"] - indices = data["indices"] - # use local data to complement - topkf = flat_fft.clone().detach() - topkf[indices] = params - - weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings - weight_total += weight - if total is None: - total = weight * topkf - else: - total += weight * topkf + for i, n in enumerate(self.peer_deques): + degree, iteration, data = self.peer_deques[n].popleft() + logging.debug( + "Averaging model from neighbor {} of iteration {}".format(n, iteration) + ) + data = self.deserialized_model(data) + params = data["params"] + indices = data["indices"] + # use local data to complement + topkf = flat_fft.clone().detach() + topkf[indices] = params + + weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings + weight_total += weight + if total is None: + total = weight * topkf + else: + total += weight * topkf - # Metro-Hastings - total += (1 - weight_total) * flat_fft - reverse_total = fft.irfft(total) + # Metro-Hastings + total += (1 - weight_total) * flat_fft + reverse_total = fft.irfft(total) - start_index = 0 - std_dict = {} - for i, key in enumerate(self.model.state_dict()): - end_index = start_index + lens[i] - std_dict[key] = reverse_total[start_index:end_index].reshape(shapes[i]) - start_index = end_index + start_index = 0 + std_dict = {} + for i, key in enumerate(self.model.state_dict()): + end_index = start_index + self.lens[i] + std_dict[key] = reverse_total[start_index:end_index].reshape(self.shapes[i]) + start_index = end_index self.model.load_state_dict(std_dict) - - 
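The 1 / (max(len(self.peer_deques), degree) + 1) factor above is the Metropolis-Hastings weight; a worked example, assuming a node of degree 3 whose three neighbors report degrees 3, 3 and 4:

# My degree, and the degrees received in the "degree" field of each message
d_i = 3
neighbor_degrees = [3, 3, 4]

weights = [1 / (max(d_i, d_j) + 1) for d_j in neighbor_degrees]  # [0.25, 0.25, 0.2]
local_weight = 1 - sum(weights)  # 0.3, so the weights sum to 1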
logging.info("Model averaging complete") - - self.communication_round += 1 - - with torch.no_grad(): - self.model.accumulated_gradients = [] - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - self.init_model = fft.rfft(concated) - if self.accumulation: - self.model.accumulated_changes += self.init_model - self.prev - self.prev = self.init_model - - t_end = time() - - logging.info( - "Sharing::step | Serialize: %f; Send: %f; Recv: %f; Averaging: %f; Total: %f", - t_post_serialize - t_start, - t_post_send - t_post_serialize, - t_post_recv - t_post_send, - t_end - t_post_recv, - t_end - t_start, - ) diff --git a/src/decentralizepy/sharing/PartialModel.py b/src/decentralizepy/sharing/PartialModel.py index 204ee23..c961c43 100644 --- a/src/decentralizepy/sharing/PartialModel.py +++ b/src/decentralizepy/sharing/PartialModel.py @@ -7,6 +7,7 @@ import numpy as np import torch from decentralizepy.sharing.Sharing import Sharing +from decentralizepy.utils import conditional_value, identity class PartialModel(Sharing): @@ -29,6 +30,9 @@ dict_ordered=True, save_shared=False, metadata_cap=1.0, + accumulation = False, + save_accumulated="", + change_transformer = identity ): """ Constructor @@ -59,6 +63,13 @@ Specifies if the indices of shared parameters should be logged metadata_cap : float Share full model when self.alpha > metadata_cap + accumulation : bool + True if the indices to share should be selected based on accumulated frequency change + save_accumulated : bool + True if accumulated weight change should be written to file. In case of accumulation the accumulated change + is stored. If a change_transformer is used then the transformed change is stored. + change_transformer : (x: Tensor) -> Tensor + A function that transforms the model change into other domains. Default: identity function """ super().__init__( @@ -69,6 +80,35 @@ self.save_shared = save_shared self.metadata_cap = metadata_cap self.total_meta = 0 + self.accumulation = accumulation + self.save_accumulated = conditional_value(save_accumulated, "", False) + self.change_transformer = change_transformer + + # getting the initial model + self.shapes = [] + self.lens = [] + with torch.no_grad(): + tensors_to_cat = [] + for _, v in self.model.state_dict().items(): + self.shapes.append(v.shape) + t = v.flatten() + self.lens.append(t.shape[0]) + tensors_to_cat.append(t) + self.init_model = torch.cat(tensors_to_cat, dim=0) + if self.accumulation: + self.model.accumulated_changes = torch.zeros_like(self.change_transformer(self.init_model)) + self.prev = self.init_model + + if self.save_accumulated: + self.model_change_path = os.path.join( + self.log_dir, "model_change/{}".format(self.rank) + ) + Path(self.model_change_path).mkdir(parents=True, exist_ok=True) + + self.model_val_path = os.path.join( + self.log_dir, "model_val/{}".format(self.rank) + ) + Path(self.model_val_path).mkdir(parents=True, exist_ok=True) # Only save for 2 procs: Save space if self.save_shared and not (rank == 0 or rank == 1): @@ -91,16 +131,9 @@ (a,b). a: The magnitudes of the topK gradients, b: Their indices.
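The accumulation bookkeeping these parameters control is easier to see on a plain vector. A rough one-round sketch, with rewind approximated as zeroing the shared coordinates; the real logic lives in model.rewind_accumulation and the _pre_step/_post_step hooks below:

import torch

init_model = torch.zeros(10)   # parameters as of the previous round
accumulated = torch.zeros(10)  # change that was not shared in earlier rounds

pre_share = torch.randn(10)    # parameters after local training
change = (pre_share - init_model) + accumulated  # what top-k selection sees (_pre_step)

_, shared_idx = torch.topk(change.abs(), 3, sorted=False)
accumulated[shared_idx] = 0.0  # rewind: shared coordinates restart from zero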
""" - logging.info("Summing up gradients") - assert len(self.model.accumulated_gradients) > 0 - gradient_sum = self.model.accumulated_gradients[0] - for i in range(1, len(self.model.accumulated_gradients)): - for key in self.model.accumulated_gradients[i]: - gradient_sum[key] += self.model.accumulated_gradients[i][key] logging.info("Returning topk gradients") - tensors_to_cat = [v.data.flatten() for _, v in gradient_sum.items()] - G_topk = torch.abs(torch.cat(tensors_to_cat, dim=0)) + G_topk = torch.abs(self.model.model_change) std, mean = torch.std_mean(G_topk, unbiased=False) self.std = std.item() self.mean = mean.item() @@ -123,8 +156,8 @@ class PartialModel(Sharing): with torch.no_grad(): _, G_topk = self.extract_top_gradients() - - self.model.rewind_accumulation(G_topk) + if self.accumulation: + self.model.rewind_accumulation(G_topk) if self.save_shared: shared_params = dict() shared_params["order"] = list(self.model.state_dict().keys()) @@ -219,3 +252,77 @@ class PartialModel(Sharing): start_index = end_index return state_dict + + def _pre_step(self): + """ + Called at the beginning of step. + + """ + logging.info("PartialModel _pre_step") + with torch.no_grad(): + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + pre_share_model = torch.cat(tensors_to_cat, dim=0) + change = self.change_transformer(pre_share_model - self.init_model) + if self.accumulation: + change += self.model.accumulated_changes + # stores change of the model due to training, change due to averaging is not accounted + self.model.model_change = change + + def _post_step(self): + """ + Called at the end of step. + + """ + logging.info("PartialModel _post_step") + with torch.no_grad(): + self.model.model_change = None + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + post_share_model = torch.cat(tensors_to_cat, dim=0) + self.init_model = post_share_model + if self.accumulation: + self.model.accumulated_changes += self.change_transformer(self.init_model - self.prev) + self.prev = self.init_model + + if self.save_accumulated: + self.save_change() + + def save_vector(self, v, s): + """ + Saves the given vector to the file. + + Parameters + ---------- + v : torch.tensor + The torch tensor to write to file + s : str + Path to folder to write to + + """ + output_dict = dict() + output_dict["order"] = list(self.model.state_dict().keys()) + shapes = dict() + for k, v1 in self.model.state_dict().items(): + shapes[k] = list(v1.shape) + output_dict["shapes"] = shapes + + output_dict["tensor"] = v.tolist() + + with open( + os.path.join( + s, + "{}.json".format(self.communication_round + 1), + ), + "w", + ) as of: + json.dump(output_dict, of) + + def save_change(self): + """ + Saves the change and the gradient values for every iteration + + """ + self.save_vector(self.model.model_change, self.model_change_path) \ No newline at end of file diff --git a/src/decentralizepy/sharing/Sharing.py b/src/decentralizepy/sharing/Sharing.py index 85fc07b..c998f40 100644 --- a/src/decentralizepy/sharing/Sharing.py +++ b/src/decentralizepy/sharing/Sharing.py @@ -31,7 +31,7 @@ class Sharing: model : decentralizepy.models.Model Model to train dataset : decentralizepy.datasets.Dataset - Dataset for sharing data. Not implemented yer! TODO + Dataset for sharing data. Not implemented yet! 
TODO log_dir : str Location to write shared_params (only writing for 2 procs per machine) @@ -122,11 +122,53 @@ class Sharing: state_dict[key] = torch.from_numpy(value) return state_dict + def _pre_step(self): + """ + Called at the beginning of step. + + """ + pass + + def _post_step(self): + """ + Called at the end of step. + + """ + pass + + def _averaging(self): + """ + Averages the received model with the local model + + """ + with torch.no_grad(): + total = dict() + weight_total = 0 + for i, n in enumerate(self.peer_deques): + degree, iteration, data = self.peer_deques[n].popleft() + logging.debug( + "Averaging model from neighbor {} of iteration {}".format(n, iteration) + ) + data = self.deserialized_model(data) + weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings + weight_total += weight + for key, value in data.items(): + if key in total: + total[key] += value * weight + else: + total[key] = value * weight + + for key, value in self.model.state_dict().items(): + total[key] += (1 - weight_total) * value # Metro-Hastings + + self.model.load_state_dict(total) + def step(self): """ Perform a sharing step. Implements D-PSGD. """ + self._pre_step() data = self.serialized_model() my_uid = self.mapping.get_uid(self.rank, self.machine_id) all_neighbors = self.graph.neighbors(my_uid) @@ -152,27 +194,8 @@ class Sharing: ) logging.info("Starting model averaging after receiving from all neighbors") - total = dict() - weight_total = 0 - for i, n in enumerate(self.peer_deques): - degree, iteration, data = self.peer_deques[n].popleft() - logging.debug( - "Averaging model from neighbor {} of iteration {}".format(n, iteration) - ) - data = self.deserialized_model(data) - weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings - weight_total += weight - for key, value in data.items(): - if key in total: - total[key] += value * weight - else: - total[key] = value * weight - - for key, value in self.model.state_dict().items(): - total[key] += (1 - weight_total) * value # Metro-Hastings - - self.model.load_state_dict(total) - + self._averaging() logging.info("Model averaging complete") self.communication_round += 1 + self._post_step() diff --git a/src/decentralizepy/sharing/TopKPlusRandom.py b/src/decentralizepy/sharing/TopKPlusRandom.py index 1a31e43..728d5bf 100644 --- a/src/decentralizepy/sharing/TopKPlusRandom.py +++ b/src/decentralizepy/sharing/TopKPlusRandom.py @@ -84,16 +84,8 @@ class TopKPlusRandom(PartialModel): (a,b). a: The magnitudes of the topK gradients, b: Their indices.
""" - logging.info("Summing up gradients") - assert len(self.model.accumulated_gradients) > 0 - gradient_sum = self.model.accumulated_gradients[0] - for i in range(1, len(self.model.accumulated_gradients)): - for key in self.model.accumulated_gradients[i]: - gradient_sum[key] += self.model.accumulated_gradients[i][key] - logging.info("Returning topk gradients") - tensors_to_cat = [v.data.flatten() for _, v in gradient_sum.items()] - G = torch.abs(torch.cat(tensors_to_cat, dim=0)) + G = torch.abs(self.model.model_change) std, mean = torch.std_mean(G, unbiased=False) self.std = std.item() self.mean = mean.item() diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index 2d651b0..1b73b29 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -8,10 +8,31 @@ import numpy as np import pywt import torch -from decentralizepy.sharing.Sharing import Sharing +from decentralizepy.sharing.PartialModel import PartialModel +def change_transformer_wavelet(x, wavelet, level): + """ + Transforms the model changes into the wavelet frequency domain + + Parameters + ---------- + x : torch.Tensor + Model change in the space domain + wavelet : str + name of the wavelet to be used in gradient compression + level: int + decomposition level to be used in gradient compression - -class Wavelet(Sharing): + Returns + ------- + x : torch.Tensor + Representation of the change in the wavelet domain + """ + coeff = pywt.wavedec(x, wavelet, level=level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + return torch.from_numpy(data.ravel()) + +class Wavelet(PartialModel): """ This class implements the wavelet version of model sharing It is based on PartialModel.py @@ -32,10 +53,10 @@ dict_ordered=True, save_shared=False, metadata_cap=1.0, - pickle=True, wavelet="haar", level=4, change_based_selection=True, + save_accumulated="", accumulation=False, ): """ @@ -67,65 +88,33 @@ Specifies if the indices of shared parameters should be logged metadata_cap : float Share full model when self.alpha > metadata_cap - pickle : bool - use pickle to serialize the model parameters wavelet: str name of the wavelet to be used in gradient compression level: int decomposition level to be used in gradient compression change_based_selection : bool use frequency change to select topk frequencies + save_accumulated : bool + True if accumulated weight change in the wavelet domain should be written to file. In case of accumulation + the accumulated change is stored.
accumulation : bool True if the the indices to share should be selected based on accumulated frequency change """ - super().__init__( - rank, machine_id, communication, mapping, graph, model, dataset, log_dir - ) - self.alpha = alpha - self.dict_ordered = dict_ordered - self.save_shared = save_shared - self.metadata_cap = metadata_cap - self.total_meta = 0 - - self.pickle = pickle self.wavelet = wavelet self.level = level - self.accumulation = accumulation - - logging.info("subsampling pickling=" + str(pickle)) - if self.save_shared: - # Only save for 2 procs: Save space - if rank != 0 or rank != 1: - self.save_shared = False - - if self.save_shared: - self.folder_path = os.path.join( - self.log_dir, "shared_params/{}".format(self.rank) - ) - Path(self.folder_path).mkdir(parents=True, exist_ok=True) + super().__init__( + rank, machine_id, communication, mapping, graph, model, dataset, log_dir, alpha, dict_ordered, save_shared, + metadata_cap, accumulation, save_accumulated, lambda x : change_transformer_wavelet(x, wavelet, level) + ) self.change_based_selection = change_based_selection - self.accumulation = accumulation - # getting the initial model - with torch.no_grad(): - self.model.accumulated_gradients = [] - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - self.init_model = torch.from_numpy(data.ravel()) - self.prev = None - if self.accumulation: - if self.model.accumulated_changes is None: - self.model.accumulated_changes = torch.zeros_like(self.init_model) - self.prev = self.init_model - else: - self.model.accumulated_changes += self.init_model - self.prev - self.prev = self.init_model + # Do a dummy transform to get the shape and coefficents slices + coeff = pywt.wavedec(self.init_model.numpy(), self.wavelet, level=self.level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + self.wt_shape = data.shape + self.coeff_slices = coeff_slices def apply_wavelet(self): """ @@ -142,31 +131,27 @@ class Wavelet(Sharing): logging.info("Returning dwt compressed model weights") tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] concated = torch.cat(tensors_to_cat, dim=0) - - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array( - coeff - ) # coeff_slices will be reproduced on the receiver - data = data.ravel() - + data = self.change_transformer(concated) + logging.info("produced wavelet representation of current model") if self.change_based_selection: - assert len(self.model.accumulated_gradients) == 1 - diff = self.model.accumulated_gradients[0] + logging.info("changed based selection") + diff = self.model.model_change _, index = torch.topk( diff.abs(), - round(self.alpha * len(data)), + round(self.alpha * len(diff)), dim=0, sorted=False, ) + logging.info("finished change based selection") else: _, index = torch.topk( - torch.from_numpy(data).abs(), + data.abs(), round(self.alpha * len(data)), dim=0, sorted=False, ) - return torch.from_numpy(data[index]), index + return data[index], index def serialized_model(self): """ @@ -178,6 +163,7 @@ class Wavelet(Sharing): Model converted to json dict """ + logging.info("serializing wavelet model") if self.alpha > self.metadata_cap: # Share fully return super().serialized_model() @@ -185,7 +171,7 @@ class Wavelet(Sharing): topk, indices = 
self.apply_wavelet() self.model.rewind_accumulation(indices) - + logging.info("finished rewind") if self.save_shared: shared_params = dict() shared_params["order"] = list(self.model.state_dict().keys()) @@ -227,12 +213,12 @@ class Wavelet(Sharing): def deserialized_model(self, m): """ - Convert received json dict to state_dict. + Convert received dict to state_dict. Parameters ---------- m : dict - json dict received + received dict Returns ------- @@ -240,26 +226,14 @@ class Wavelet(Sharing): state_dict of received """ + logging.info("deserializing wavelet model") if self.alpha > self.metadata_cap: # Share fully return super().deserialized_model(m) with torch.no_grad(): - state_dict = self.model.state_dict() - if not self.dict_ordered: raise NotImplementedError - shapes = [] - lens = [] - tensors_to_cat = [] - for _, v in state_dict.items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - - T = torch.cat(tensors_to_cat, dim=0) - indices = m["indices"] alpha = m["alpha"] params = m["params"] @@ -271,128 +245,51 @@ class Wavelet(Sharing): ret["params"] = params_tensor return ret - def step(self): + def _averaging(self): """ - Perform a sharing step. Implements D-PSGD. + Averages the received model with the local model """ - t_start = time() - shapes = [] - lens = [] - end_model = None - change = 0 - self.model.accumulated_gradients = [] with torch.no_grad(): - # FFT of this model - tensors_to_cat = [] - for _, v in self.model.state_dict().items(): - shapes.append(v.shape) - t = v.flatten() - lens.append(t.shape[0]) - tensors_to_cat.append(t) - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - shape = data.shape - wt_params = data.ravel() - end_model = torch.from_numpy(wt_params) - change = end_model - self.init_model - if self.accumulation: - change += self.model.accumulated_changes - self.model.accumulated_gradients.append(change) - data = self.serialized_model() - t_post_serialize = time() - my_uid = self.mapping.get_uid(self.rank, self.machine_id) - all_neighbors = self.graph.neighbors(my_uid) - iter_neighbors = self.get_neighbors(all_neighbors) - data["degree"] = len(all_neighbors) - data["iteration"] = self.communication_round - for neighbor in iter_neighbors: - self.communication.send(neighbor, data) - t_post_send = time() - logging.info("Waiting for messages from neighbors") - while not self.received_from_all(): - sender, data = self.communication.receive() - logging.debug("Received model from {}".format(sender)) - degree = data["degree"] - iteration = data["iteration"] - del data["degree"] - del data["iteration"] - self.peer_deques[sender].append((degree, iteration, data)) - logging.info( - "Deserialized received model from {} of iteration {}".format( - sender, iteration + total = None + weight_total = 0 + wt_params = self.change_transformer(self.init_model) + for i, n in enumerate(self.peer_deques): + degree, iteration, data = self.peer_deques[n].popleft() + logging.debug( + "Averaging model from neighbor {} of iteration {}".format(n, iteration) ) - ) - t_post_recv = time() + data = self.deserialized_model(data) + params = data["params"] + indices = data["indices"] + # use local data to complement + topkwf = wt_params.clone().detach() + topkwf[indices] = params + topkwf = topkwf.reshape(self.wt_shape) + + weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings + weight_total += weight + if total is 
None: + total = weight * topkwf + else: + total += weight * topkwf - logging.info("Starting model averaging after receiving from all neighbors") - total = None - weight_total = 0 + # Metro-Hastings + total += (1 - weight_total) * wt_params - for i, n in enumerate(self.peer_deques): - degree, iteration, data = self.peer_deques[n].popleft() - logging.debug( - "Averaging model from neighbor {} of iteration {}".format(n, iteration) + avg_wf_params = pywt.array_to_coeffs( + total.numpy(), self.coeff_slices, output_format="wavedec" + ) + reverse_total = torch.from_numpy( + pywt.waverec(avg_wf_params, wavelet=self.wavelet) ) - data = self.deserialized_model(data) - params = data["params"] - indices = data["indices"] - # use local data to complement - topkwf = wt_params.copy() # .clone().detach() - topkwf[indices] = params - topkwf = torch.from_numpy(topkwf.reshape(shape)) - - weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings - weight_total += weight - if total is None: - total = weight * topkwf - else: - total += weight * topkwf - - # Metro-Hastings - total += (1 - weight_total) * wt_params - - avg_wf_params = pywt.array_to_coeffs( - total, coeff_slices, output_format="wavedec" - ) - reverse_total = torch.from_numpy( - pywt.waverec(avg_wf_params, wavelet=self.wavelet) - ) - start_index = 0 - std_dict = {} - for i, key in enumerate(self.model.state_dict()): - end_index = start_index + lens[i] - std_dict[key] = reverse_total[start_index:end_index].reshape(shapes[i]) - start_index = end_index + start_index = 0 + std_dict = {} + for i, key in enumerate(self.model.state_dict()): + end_index = start_index + self.lens[i] + std_dict[key] = reverse_total[start_index:end_index].reshape(self.shapes[i]) + start_index = end_index self.model.load_state_dict(std_dict) - logging.info("Model averaging complete") - - self.communication_round += 1 - - with torch.no_grad(): - self.model.accumulated_gradients = [] - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.state_dict().items() - ] - concated = torch.cat(tensors_to_cat, dim=0) - coeff = pywt.wavedec(concated.numpy(), self.wavelet, level=self.level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - self.init_model = torch.from_numpy(data.ravel()) - if self.accumulation: - self.model.accumulated_changes += self.init_model - self.prev - self.prev = self.init_model - - t_end = time() - - logging.info( - "Sharing::step | Serialize: %f; Send: %f; Recv: %f; Averaging: %f; Total: %f", - t_post_serialize - t_start, - t_post_send - t_post_serialize, - t_post_recv - t_post_send, - t_end - t_post_recv, - t_end - t_start, - ) diff --git a/src/decentralizepy/training/GradientAccumulator.py b/src/decentralizepy/training/GradientAccumulator.py deleted file mode 100644 index fcff8e6..0000000 --- a/src/decentralizepy/training/GradientAccumulator.py +++ /dev/null @@ -1,112 +0,0 @@ -import logging - -from decentralizepy.training.Training import Training - - -class GradientAccumulator(Training): - """ - This class implements the training module which also accumulates gradients of steps in a list. 
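The pattern shared by `apply_wavelet` and the wavelet `_averaging` is select-and-merge: the sender keeps the `alpha` fraction of coefficients whose (accumulated) change magnitude is largest and ships the values together with their indices; the receiver overlays them on its own copy of the coefficient vector before weighting. A sketch of both halves, with hypothetical names:

    import torch

    def select_topk(change, values, alpha):
        # Sender: keep the alpha fraction of entries with the largest
        # (accumulated) change magnitude; ship values plus indices.
        k = round(alpha * len(change))
        _, indices = torch.topk(change.abs(), k, dim=0, sorted=False)
        return values[indices], indices

    def merge_sparse(local_vector, params, indices):
        # Receiver: complement the sparse message with the local copy of
        # the coefficient vector before Metro-Hastings weighting.
        merged = local_vector.clone().detach()
        merged[indices] = params
        return merged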
- - """ - - def __init__( - self, - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds="", - full_epochs="", - batch_size="", - shuffle="", - ): - """ - Constructor - - Parameters - ---------- - rank : int - Rank of process local to the machine - machine_id : int - Machine ID on which the process in running - mapping : decentralizepy.mappings - The object containing the mapping rank <--> uid - model : torch.nn.Module - Neural Network for training - optimizer : torch.optim - Optimizer to learn parameters - loss : function - Loss function - log_dir : str - Directory to log the model change. - rounds : int, optional - Number of steps/epochs per training call - full_epochs: bool, optional - True if 1 round = 1 epoch. False if 1 round = 1 minibatch - batch_size : int, optional - Number of items to learn over, in one batch - shuffle : bool - True if the dataset should be shuffled before training. - - """ - super().__init__( - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds, - full_epochs, - batch_size, - shuffle, - ) - - def trainstep(self, data, target): - """ - One training step on a minibatch. - - Parameters - ---------- - data : any - Data item - target : any - Label - - Returns - ------- - int - Loss Value for the step - - """ - self.model.zero_grad() - output = self.model(data) - loss_val = self.loss(output, target) - loss_val.backward() - logging.debug("Accumulating Gradients") - self.model.accumulated_gradients.append( - { - k: v.grad.clone().detach() - for k, v in zip(self.model.state_dict(), self.model.parameters()) - } - ) - self.optimizer.step() - return loss_val.item() - - def train(self, dataset): - """ - One training iteration with accumulation of gradients in model.accumulated_gradients. - Goes through the entire dataset. - - Parameters - ---------- - dataset : decentralizepy.datasets.Dataset - The training dataset. 
Should implement get_trainset(batch_size, shuffle) - - """ - self.model.accumulated_gradients = [] - super().train(dataset) diff --git a/src/decentralizepy/utils.py b/src/decentralizepy/utils.py index 996e4bc..f919468 100644 --- a/src/decentralizepy/utils.py +++ b/src/decentralizepy/utils.py @@ -108,3 +108,17 @@ def write_args(args, path): } with open(os.path.join(path, "args.json"), "w") as of: json.dump(data, of) + +def identity(obj): + """ + Identity function + Parameters + ---------- + obj + Some object + Returns + ------- + obj + The same object + """ + return obj \ No newline at end of file -- GitLab From 9c9efb16d80c430b47957bea34e5316e9122947e Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Sat, 19 Mar 2022 12:27:19 +0100 Subject: [PATCH 11/16] deleting ChangeAccumulator.py and fixing configs --- eval/step_configs/config_femnist_topkacc.ini | 4 +- .../training/ChangeAccumulator.py | 192 ------------------ 2 files changed, 2 insertions(+), 194 deletions(-) delete mode 100644 src/decentralizepy/training/ChangeAccumulator.py diff --git a/eval/step_configs/config_femnist_topkacc.ini b/eval/step_configs/config_femnist_topkacc.ini index 2705fe7..c9155d1 100644 --- a/eval/step_configs/config_femnist_topkacc.ini +++ b/eval/step_configs/config_femnist_topkacc.ini @@ -15,8 +15,8 @@ lr = 0.001 # There are 734463 femnist samples [TRAIN_PARAMS] -training_package = decentralizepy.training.ChangeAccumulator -training_class = ChangeAccumulator +training_package = decentralizepy.training.Training +training_class = Training rounds = 47 full_epochs = False batch_size = 16 diff --git a/src/decentralizepy/training/ChangeAccumulator.py b/src/decentralizepy/training/ChangeAccumulator.py deleted file mode 100644 index 5e55621..0000000 --- a/src/decentralizepy/training/ChangeAccumulator.py +++ /dev/null @@ -1,192 +0,0 @@ -import json -import os -from pathlib import Path - -import torch - -from decentralizepy.training.Training import Training -from decentralizepy.utils import conditional_value - - -class ChangeAccumulator(Training): - """ - This class implements the training module which also accumulates model change in a list. - - """ - - def __init__( - self, - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds="", - full_epochs="", - batch_size="", - shuffle="", - save_accumulated="", - accumulation=True, - ): - """ - Constructor - - Parameters - ---------- - rank : int - Rank of process local to the machine - machine_id : int - Machine ID on which the process in running - mapping : decentralizepy.mappings - The object containing the mapping rank <--> uid - model : torch.nn.Module - Neural Network for training - optimizer : torch.optim - Optimizer to learn parameters - loss : function - Loss function - log_dir : str - Directory to log the model change. - rounds : int, optional - Number of steps/epochs per training call - full_epochs: bool, optional - True if 1 round = 1 epoch. False if 1 round = 1 minibatch - batch_size : int, optional - Number of items to learn over, in one batch - shuffle : bool - True if the dataset should be shuffled before training. 
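The bookkeeping that `ChangeAccumulator` performed, and that now lives in `PartialModel`, is a running sum of how far the flattened parameters drift between calls. A compact sketch of that tracker, assuming a standard `torch.nn.Module` (helper names are illustrative):

    import torch

    def flatten(model):
        # Concatenate all state_dict tensors into one 1-D vector.
        return torch.cat([v.data.flatten() for _, v in model.state_dict().items()])

    class ChangeTracker:
        def __init__(self, model):
            self.prev = flatten(model)
            self.accumulated = torch.zeros_like(self.prev)

        def update(self, model):
            # Add the drift since the previous call into the running buffer.
            cur = flatten(model)
            self.accumulated += cur - self.prev
            self.prev = cur
            return self.accumulated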
- save_accumulated : bool - True if accumulated weight change should be written to file - accumulation : bool - True if the model change should be accumulated across communication steps - - """ - super().__init__( - rank, - machine_id, - mapping, - model, - optimizer, - loss, - log_dir, - rounds, - full_epochs, - batch_size, - shuffle, - ) - self.save_accumulated = conditional_value(save_accumulated, "", False) - self.communication_round = 0 - if self.save_accumulated: - self.model_change_path = os.path.join( - self.log_dir, "model_change/{}".format(self.rank) - ) - Path(self.model_change_path).mkdir(parents=True, exist_ok=True) - - self.model_val_path = os.path.join( - self.log_dir, "model_val/{}".format(self.rank) - ) - Path(self.model_val_path).mkdir(parents=True, exist_ok=True) - self.accumulation = accumulation - self.init_model = None - self.prev = None - - def save_vector(self, v, s): - """ - Saves the given vector to the file. - - Parameters - ---------- - v : torch.tensor - The torch tensor to write to file - s : str - Path to folder to write to - - """ - output_dict = dict() - output_dict["order"] = list(self.model.state_dict().keys()) - shapes = dict() - for k, v1 in self.model.state_dict().items(): - shapes[k] = list(v1.shape) - output_dict["shapes"] = shapes - - output_dict["tensor"] = v.tolist() - - with open( - os.path.join( - s, - "{}.json".format(self.communication_round + 1), - ), - "w", - ) as of: - json.dump(output_dict, of) - - def save_change(self): - """ - Saves the change and the gradient values for every iteration - - """ - tensors_to_cat = [ - v.data.flatten() for _, v in self.model.accumulated_gradients[0].items() - ] - change = torch.abs(torch.cat(tensors_to_cat, dim=0)) - self.save_vector(change, self.model_change_path) - - def save_model_params(self): - """ - Saves the change and the gradient values for every iteration - - """ - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] - params = torch.abs(torch.cat(tensors_to_cat, dim=0)) - self.save_vector(params, self.model_val_path) - - def train(self, dataset): - """ - One training iteration with accumulation of model change in model.accumulated_gradients. - Goes through the entire dataset. - - Parameters - ---------- - dataset : decentralizepy.datasets.Dataset - The training dataset. 
Should implement get_trainset(batch_size, shuffle) - - """ - self.model.accumulated_gradients = [] - self.init_model = { - k: v.data.clone().detach() - for k, v in zip(self.model.state_dict(), self.model.parameters()) - } - if self.accumulation: - if self.model.accumulated_changes is None: - flats = [v.data.flatten() for _, v in self.init_model.items()] - flat = torch.cat(flats) - self.model.accumulated_changes = torch.zeros_like(flat) - self.prev = flat - else: - flats = [v.data.flatten() for _, v in self.init_model.items()] - flat = torch.cat(flats) - self.model.accumulated_changes += flat - self.prev - self.prev = flat - - super().train(dataset) - with torch.no_grad(): - change = { - k: v.data.clone().detach() - self.init_model[k] - for k, v in zip(self.model.state_dict(), self.model.parameters()) - } - if self.accumulation: - flats_change = [v.data.flatten() for _, v in change.items()] - flat_change = torch.cat(flats_change) - # flatten does not copy data if input is already flattened - # however cat copies - change = {"flat": self.model.accumulated_changes + flat_change} - - self.model.accumulated_gradients.append(change) - - if self.save_accumulated: - self.save_change() - self.save_model_params() - - self.communication_round += 1 -- GitLab From 2efaec4a2261f339a279d7ee8ed19f14bcae91be Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Sat, 19 Mar 2022 13:23:11 +0100 Subject: [PATCH 12/16] Changing the accumulation implementation --- src/decentralizepy/sharing/FFT.py | 6 +++++- src/decentralizepy/sharing/PartialModel.py | 21 ++++++++++++++++----- src/decentralizepy/sharing/Wavelet.py | 6 +++++- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index 1bc7e0e..6af75e3 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -51,6 +51,7 @@ class FFT(PartialModel): change_based_selection=True, save_accumulated="", accumulation=True, + accumulate_averaging_changes=False ): """ Constructor @@ -88,10 +89,13 @@ class FFT(PartialModel): the accumulated change is stored. accumulation : bool True if the the indices to share should be selected based on accumulated frequency change + accumulate_averaging_changes: bool + True if the accumulation should account the model change due to averaging + """ super().__init__( rank, machine_id, communication, mapping, graph, model, dataset, log_dir, alpha, dict_ordered, save_shared, - metadata_cap, accumulation, save_accumulated, change_transformer_fft + metadata_cap, accumulation, save_accumulated, change_transformer_fft, accumulate_averaging_changes ) self.change_based_selection = change_based_selection diff --git a/src/decentralizepy/sharing/PartialModel.py b/src/decentralizepy/sharing/PartialModel.py index c961c43..935f302 100644 --- a/src/decentralizepy/sharing/PartialModel.py +++ b/src/decentralizepy/sharing/PartialModel.py @@ -32,7 +32,8 @@ class PartialModel(Sharing): metadata_cap=1.0, accumulation = False, save_accumulated="", - change_transformer = identity + change_transformer = identity, + accumulate_averaging_changes = False ): """ Constructor @@ -70,6 +71,8 @@ class PartialModel(Sharing): is stored. If a change_transformer is used then the transformed change is stored. change_transformer : (x: Tensor) -> Tensor A function that transforms the model change into other domains. 
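The `accumulate_averaging_changes` flag added in this patch switches between two bookkeeping modes: either only the training-induced change is folded into the buffer before the step, or the drift caused by averaging is added back after it. A sketch of the two hooks, with a module-level `state` dict standing in for the model attributes (all names hypothetical):

    import torch

    state = {"init": torch.zeros(4), "prev": torch.zeros(4), "acc": torch.zeros(4)}
    identity = lambda t: t

    def pre_step(pre_share, accumulate_averaging_changes, transform=identity):
        change = transform(pre_share - state["init"])
        if not accumulate_averaging_changes:
            # Fold the buffer in now; it gets rewound during the step.
            state["acc"] += change
            change = state["acc"].clone().detach()
        else:
            # Legacy mode: the buffer is topped up later, in post_step.
            change = change + state["acc"]
        return change  # becomes model.model_change, used for top-k selection

    def post_step(post_share, accumulate_averaging_changes, transform=identity):
        state["init"] = post_share
        if accumulate_averaging_changes:
            # Also capture the change caused by averaging with neighbors.
            state["acc"] += transform(post_share - state["prev"])
        state["prev"] = post_share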
Default: identity function + accumulate_averaging_changes: bool + True if the accumulation should account the model change due to averaging """ super().__init__( @@ -83,6 +86,7 @@ class PartialModel(Sharing): self.accumulation = accumulation self.save_accumulated = conditional_value(save_accumulated, "", False) self.change_transformer = change_transformer + self.accumulate_averaging_changes = accumulate_averaging_changes # getting the initial model self.shapes = [] @@ -266,7 +270,14 @@ class PartialModel(Sharing): pre_share_model = torch.cat(tensors_to_cat, dim=0) change = self.change_transformer(pre_share_model - self.init_model) if self.accumulation: - change += self.model.accumulated_changes + if not self.accumulate_averaging_changes: + # Need to accumulate in _pre_step as the accumulation gets rewind during the step + self.model.accumulated_changes += change + change = self.model.accumulated_changes.clone().detach() + else: + # For the legacy implementation, we will only rewind currently accumulated values + # and add the model change due to averaging in the end + change += self.model.accumulated_changes # stores change of the model due to training, change due to averaging is not accounted self.model.model_change = change @@ -277,16 +288,16 @@ class PartialModel(Sharing): """ logging.info("PartialModel _post_step") with torch.no_grad(): - self.model.model_change = None tensors_to_cat = [ v.data.flatten() for _, v in self.model.state_dict().items() ] post_share_model = torch.cat(tensors_to_cat, dim=0) self.init_model = post_share_model if self.accumulation: - self.model.accumulated_changes += self.change_transformer(self.init_model - self.prev) + if self.accumulate_averaging_changes: + self.model.accumulated_changes += self.change_transformer(self.init_model - self.prev) self.prev = self.init_model - + self.model.model_change = None if self.save_accumulated: self.save_change() diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index 1b73b29..cd039f8 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -58,6 +58,7 @@ class Wavelet(PartialModel): change_based_selection=True, save_accumulated="", accumulation=False, + accumulate_averaging_changes = False ): """ Constructor @@ -99,13 +100,16 @@ class Wavelet(PartialModel): the accumulated change is stored. 
accumulation : bool True if the the indices to share should be selected based on accumulated frequency change + accumulate_averaging_changes: bool + True if the accumulation should account the model change due to averaging """ self.wavelet = wavelet self.level = level super().__init__( rank, machine_id, communication, mapping, graph, model, dataset, log_dir, alpha, dict_ordered, save_shared, - metadata_cap, accumulation, save_accumulated, lambda x : change_transformer_wavelet(x, wavelet, level) + metadata_cap, accumulation, save_accumulated, lambda x : change_transformer_wavelet(x, wavelet, level), + accumulate_averaging_changes ) self.change_based_selection = change_based_selection -- GitLab From 1b7936b126a572677ab459143bdb6a5e545f94aa Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Sat, 19 Mar 2022 18:18:50 +0100 Subject: [PATCH 13/16] wavelet and fft fix --- eval/step_configs/config_celeba_wavelet.ini | 2 +- eval/step_configs/config_femnist.ini | 1 + eval/step_configs/config_femnist_wavelet.ini | 2 +- src/decentralizepy/sharing/FFT.py | 7 +++++-- src/decentralizepy/sharing/PartialModel.py | 2 +- src/decentralizepy/sharing/Wavelet.py | 6 +++++- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/eval/step_configs/config_celeba_wavelet.ini b/eval/step_configs/config_celeba_wavelet.ini index 70e9f15..1c97eb9 100644 --- a/eval/step_configs/config_celeba_wavelet.ini +++ b/eval/step_configs/config_celeba_wavelet.ini @@ -34,5 +34,5 @@ sharing_class = Wavelet change_based_selection = True alpha = 0.1 wavelet=sym2 -level= None +level= 4 accumulation = True diff --git a/eval/step_configs/config_femnist.ini b/eval/step_configs/config_femnist.ini index 8063181..de4f1ce 100644 --- a/eval/step_configs/config_femnist.ini +++ b/eval/step_configs/config_femnist.ini @@ -31,3 +31,4 @@ addresses_filepath = ip_addr_6Machines.json [SHARING] sharing_package = decentralizepy.sharing.PartialModel sharing_class = PartialModel +alpha=0.1 diff --git a/eval/step_configs/config_femnist_wavelet.ini b/eval/step_configs/config_femnist_wavelet.ini index 68704a3..b6ff278 100644 --- a/eval/step_configs/config_femnist_wavelet.ini +++ b/eval/step_configs/config_femnist_wavelet.ini @@ -35,5 +35,5 @@ sharing_class = Wavelet change_based_selection = True alpha = 0.1 wavelet=sym2 -level= None +level= 4 accumulation = True diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index 6af75e3..e0e67fd 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -224,8 +224,11 @@ class FFT(PartialModel): with torch.no_grad(): total = None weight_total = 0 - - flat_fft = self.change_transformer(self.init_model) + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + pre_share_model = torch.cat(tensors_to_cat, dim=0) + flat_fft = self.change_transformer(pre_share_model) for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() diff --git a/src/decentralizepy/sharing/PartialModel.py b/src/decentralizepy/sharing/PartialModel.py index 935f302..dca5c75 100644 --- a/src/decentralizepy/sharing/PartialModel.py +++ b/src/decentralizepy/sharing/PartialModel.py @@ -155,7 +155,7 @@ class PartialModel(Sharing): Model converted to a dict """ - if self.alpha > self.metadata_cap: # Share fully + if self.alpha >= self.metadata_cap: # Share fully return super().serialized_model() with torch.no_grad(): diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index 
cd039f8..e41bd24 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -257,7 +257,11 @@ class Wavelet(PartialModel): with torch.no_grad(): total = None weight_total = 0 - wt_params = self.change_transformer(self.init_model) + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] + pre_share_model = torch.cat(tensors_to_cat, dim=0) + wt_params = self.change_transformer(pre_share_model) for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() logging.debug( -- GitLab From fc6ee11c287ac17e642ed3e6033b338db80123c8 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Sat, 19 Mar 2022 19:11:48 +0100 Subject: [PATCH 14/16] reformatting --- eval/run_all.sh | 2 +- ...eba.ini => config_celeba_partialmodel.ini} | 0 ...leba_100.ini => config_celeba_sharing.ini} | 0 ...st.ini => config_femnist_partialmodel.ini} | 0 ...ist_100.ini => config_femnist_sharing.ini} | 0 eval/testing.py | 1 - src/decentralizepy/node/Node.py | 1 - src/decentralizepy/sharing/FFT.py | 38 +++++++-- src/decentralizepy/sharing/PartialModel.py | 18 +++-- src/decentralizepy/sharing/Sharing.py | 4 +- src/decentralizepy/sharing/Wavelet.py | 78 +++++++++++-------- src/decentralizepy/utils.py | 3 +- 12 files changed, 93 insertions(+), 52 deletions(-) rename eval/step_configs/{config_celeba.ini => config_celeba_partialmodel.ini} (100%) rename eval/step_configs/{config_celeba_100.ini => config_celeba_sharing.ini} (100%) rename eval/step_configs/{config_femnist.ini => config_femnist_partialmodel.ini} (100%) rename eval/step_configs/{config_femnist_100.ini => config_femnist_sharing.ini} (100%) diff --git a/eval/run_all.sh b/eval/run_all.sh index 1afdf02..c9e5714 100755 --- a/eval/run_all.sh +++ b/eval/run_all.sh @@ -18,7 +18,7 @@ ip_machines=$nfs_home/configs/ip_addr_6Machines.json m=`cat $ip_machines | grep $(/sbin/ifconfig ens785 | grep 'inet ' | awk '{print $2}') | cut -d'"' -f2` export PYTHONFAULTHANDLER=1 -tests=("step_configs/config_celeba.ini" "step_configs/config_celeba_100.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini" +tests=("step_configs/config_celeba_partialmodel.ini" "step_configs/config_celeba_sharing.ini" "step_configs/config_celeba_fft.ini" "step_configs/config_celeba_wavelet.ini" "step_configs/config_celeba_grow.ini" "step_configs/config_celeba_manualadapt.ini" "step_configs/config_celeba_randomalpha.ini" "step_configs/config_celeba_randomalphainc.ini" "step_configs/config_celeba_roundrobin.ini" "step_configs/config_celeba_subsampling.ini" "step_configs/config_celeba_topkrandom.ini" "step_configs/config_celeba_topkacc.ini" "step_configs/config_celeba_topkparam.ini") diff --git a/eval/step_configs/config_celeba.ini b/eval/step_configs/config_celeba_partialmodel.ini similarity index 100% rename from eval/step_configs/config_celeba.ini rename to eval/step_configs/config_celeba_partialmodel.ini diff --git a/eval/step_configs/config_celeba_100.ini b/eval/step_configs/config_celeba_sharing.ini similarity index 100% rename from eval/step_configs/config_celeba_100.ini rename to eval/step_configs/config_celeba_sharing.ini diff --git a/eval/step_configs/config_femnist.ini b/eval/step_configs/config_femnist_partialmodel.ini similarity index 100% rename from eval/step_configs/config_femnist.ini rename to eval/step_configs/config_femnist_partialmodel.ini diff --git a/eval/step_configs/config_femnist_100.ini b/eval/step_configs/config_femnist_sharing.ini similarity index 100% rename from 
eval/step_configs/config_femnist_100.ini rename to eval/step_configs/config_femnist_sharing.ini diff --git a/eval/testing.py b/eval/testing.py index b9c4081..abd6333 100644 --- a/eval/testing.py +++ b/eval/testing.py @@ -65,4 +65,3 @@ if __name__ == "__main__": args.reset_optimizer, ], ) - print("after spawn") diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py index 463f57f..7854c38 100644 --- a/src/decentralizepy/node/Node.py +++ b/src/decentralizepy/node/Node.py @@ -481,4 +481,3 @@ class Node: ) self.run() - logging.info("Node finished running") diff --git a/src/decentralizepy/sharing/FFT.py b/src/decentralizepy/sharing/FFT.py index e0e67fd..0c0172f 100644 --- a/src/decentralizepy/sharing/FFT.py +++ b/src/decentralizepy/sharing/FFT.py @@ -27,6 +27,7 @@ def change_transformer_fft(x): """ return fft.rfft(x) + class FFT(PartialModel): """ This class implements the fft version of model sharing @@ -51,7 +52,7 @@ class FFT(PartialModel): change_based_selection=True, save_accumulated="", accumulation=True, - accumulate_averaging_changes=False + accumulate_averaging_changes=False, ): """ Constructor @@ -94,8 +95,22 @@ class FFT(PartialModel): """ super().__init__( - rank, machine_id, communication, mapping, graph, model, dataset, log_dir, alpha, dict_ordered, save_shared, - metadata_cap, accumulation, save_accumulated, change_transformer_fft, accumulate_averaging_changes + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha, + dict_ordered, + save_shared, + metadata_cap, + accumulation, + save_accumulated, + change_transformer_fft, + accumulate_averaging_changes, ) self.change_based_selection = change_based_selection @@ -113,7 +128,9 @@ class FFT(PartialModel): logging.info("Returning fft compressed model weights") with torch.no_grad(): - tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] + tensors_to_cat = [ + v.data.flatten() for _, v in self.model.state_dict().items() + ] concated = torch.cat(tensors_to_cat, dim=0) flat_fft = self.change_transformer(concated) if self.change_based_selection: @@ -123,7 +140,10 @@ class FFT(PartialModel): ) else: _, index = torch.topk( - flat_fft.abs(), round(self.alpha * len(flat_fft)), dim=0, sorted=False + flat_fft.abs(), + round(self.alpha * len(flat_fft)), + dim=0, + sorted=False, ) return flat_fft[index], index @@ -233,7 +253,9 @@ class FFT(PartialModel): for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() logging.debug( - "Averaging model from neighbor {} of iteration {}".format(n, iteration) + "Averaging model from neighbor {} of iteration {}".format( + n, iteration + ) ) data = self.deserialized_model(data) params = data["params"] @@ -257,7 +279,9 @@ class FFT(PartialModel): std_dict = {} for i, key in enumerate(self.model.state_dict()): end_index = start_index + self.lens[i] - std_dict[key] = reverse_total[start_index:end_index].reshape(self.shapes[i]) + std_dict[key] = reverse_total[start_index:end_index].reshape( + self.shapes[i] + ) start_index = end_index self.model.load_state_dict(std_dict) diff --git a/src/decentralizepy/sharing/PartialModel.py b/src/decentralizepy/sharing/PartialModel.py index dca5c75..97c702b 100644 --- a/src/decentralizepy/sharing/PartialModel.py +++ b/src/decentralizepy/sharing/PartialModel.py @@ -30,10 +30,10 @@ class PartialModel(Sharing): dict_ordered=True, save_shared=False, metadata_cap=1.0, - accumulation = False, + accumulation=False, save_accumulated="", - change_transformer 
= identity, - accumulate_averaging_changes = False + change_transformer=identity, + accumulate_averaging_changes=False, ): """ Constructor @@ -100,9 +100,11 @@ class PartialModel(Sharing): tensors_to_cat.append(t) self.init_model = torch.cat(tensors_to_cat, dim=0) if self.accumulation: - self.model.accumulated_changes = torch.zeros_like(self.change_transformer(self.init_model)) + self.model.accumulated_changes = torch.zeros_like( + self.change_transformer(self.init_model) + ) self.prev = self.init_model - + if self.save_accumulated: self.model_change_path = os.path.join( self.log_dir, "model_change/{}".format(self.rank) @@ -295,7 +297,9 @@ class PartialModel(Sharing): self.init_model = post_share_model if self.accumulation: if self.accumulate_averaging_changes: - self.model.accumulated_changes += self.change_transformer(self.init_model - self.prev) + self.model.accumulated_changes += self.change_transformer( + self.init_model - self.prev + ) self.prev = self.init_model self.model.model_change = None if self.save_accumulated: @@ -336,4 +340,4 @@ class PartialModel(Sharing): Saves the change and the gradient values for every iteration """ - self.save_vector(self.model.model_change, self.model_change_path) \ No newline at end of file + self.save_vector(self.model.model_change, self.model_change_path) diff --git a/src/decentralizepy/sharing/Sharing.py b/src/decentralizepy/sharing/Sharing.py index c998f40..3fe189c 100644 --- a/src/decentralizepy/sharing/Sharing.py +++ b/src/decentralizepy/sharing/Sharing.py @@ -147,7 +147,9 @@ class Sharing: for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() logging.debug( - "Averaging model from neighbor {} of iteration {}".format(n, iteration) + "Averaging model from neighbor {} of iteration {}".format( + n, iteration + ) ) data = self.deserialized_model(data) weight = 1 / (max(len(self.peer_deques), degree) + 1) # Metro-Hastings diff --git a/src/decentralizepy/sharing/Wavelet.py b/src/decentralizepy/sharing/Wavelet.py index e41bd24..363a487 100644 --- a/src/decentralizepy/sharing/Wavelet.py +++ b/src/decentralizepy/sharing/Wavelet.py @@ -10,27 +10,29 @@ import torch from decentralizepy.sharing.PartialModel import PartialModel -def change_transformer_wavelet(x, wavelet, level): - """ - Transforms the model changes into wavelet frequency domain - Parameters - ---------- - x : torch.Tensor - Model change in the space domain - wavelet : str - name of the wavelet to be used in gradient compression - level: int - name of the wavelet to be used in gradient compression +def change_transformer_wavelet(x, wavelet, level): + """ + Transforms the model changes into wavelet frequency domain + + Parameters + ---------- + x : torch.Tensor + Model change in the space domain + wavelet : str + name of the wavelet to be used in gradient compression + level: int + name of the wavelet to be used in gradient compression + + Returns + ------- + x : torch.Tensor + Representation of the change int the wavelet domain + """ + coeff = pywt.wavedec(x, wavelet, level=level) + data, coeff_slices = pywt.coeffs_to_array(coeff) + return torch.from_numpy(data.ravel()) - Returns - ------- - x : torch.Tensor - Representation of the change int the wavelet domain - """ - coeff = pywt.wavedec(x, wavelet, level=level) - data, coeff_slices = pywt.coeffs_to_array(coeff) - return torch.from_numpy(data.ravel()) class Wavelet(PartialModel): """ @@ -58,7 +60,7 @@ class Wavelet(PartialModel): change_based_selection=True, save_accumulated="", 
accumulation=False, - accumulate_averaging_changes = False + accumulate_averaging_changes=False, ): """ Constructor @@ -107,9 +109,22 @@ class Wavelet(PartialModel): self.level = level super().__init__( - rank, machine_id, communication, mapping, graph, model, dataset, log_dir, alpha, dict_ordered, save_shared, - metadata_cap, accumulation, save_accumulated, lambda x : change_transformer_wavelet(x, wavelet, level), - accumulate_averaging_changes + rank, + machine_id, + communication, + mapping, + graph, + model, + dataset, + log_dir, + alpha, + dict_ordered, + save_shared, + metadata_cap, + accumulation, + save_accumulated, + lambda x: change_transformer_wavelet(x, wavelet, level), + accumulate_averaging_changes, ) self.change_based_selection = change_based_selection @@ -132,13 +147,11 @@ class Wavelet(PartialModel): """ - logging.info("Returning dwt compressed model weights") + logging.info("Returning wavelet compressed model weights") tensors_to_cat = [v.data.flatten() for _, v in self.model.state_dict().items()] concated = torch.cat(tensors_to_cat, dim=0) data = self.change_transformer(concated) - logging.info("produced wavelet representation of current model") if self.change_based_selection: - logging.info("changed based selection") diff = self.model.model_change _, index = torch.topk( diff.abs(), @@ -146,7 +159,6 @@ class Wavelet(PartialModel): dim=0, sorted=False, ) - logging.info("finished change based selection") else: _, index = torch.topk( data.abs(), @@ -167,7 +179,6 @@ class Wavelet(PartialModel): Model converted to json dict """ - logging.info("serializing wavelet model") if self.alpha > self.metadata_cap: # Share fully return super().serialized_model() @@ -175,7 +186,6 @@ class Wavelet(PartialModel): topk, indices = self.apply_wavelet() self.model.rewind_accumulation(indices) - logging.info("finished rewind") if self.save_shared: shared_params = dict() shared_params["order"] = list(self.model.state_dict().keys()) @@ -230,7 +240,6 @@ class Wavelet(PartialModel): state_dict of received """ - logging.info("deserializing wavelet model") if self.alpha > self.metadata_cap: # Share fully return super().deserialized_model(m) @@ -265,7 +274,9 @@ class Wavelet(PartialModel): for i, n in enumerate(self.peer_deques): degree, iteration, data = self.peer_deques[n].popleft() logging.debug( - "Averaging model from neighbor {} of iteration {}".format(n, iteration) + "Averaging model from neighbor {} of iteration {}".format( + n, iteration + ) ) data = self.deserialized_model(data) params = data["params"] @@ -296,8 +307,9 @@ class Wavelet(PartialModel): std_dict = {} for i, key in enumerate(self.model.state_dict()): end_index = start_index + self.lens[i] - std_dict[key] = reverse_total[start_index:end_index].reshape(self.shapes[i]) + std_dict[key] = reverse_total[start_index:end_index].reshape( + self.shapes[i] + ) start_index = end_index self.model.load_state_dict(std_dict) - diff --git a/src/decentralizepy/utils.py b/src/decentralizepy/utils.py index f919468..82f2068 100644 --- a/src/decentralizepy/utils.py +++ b/src/decentralizepy/utils.py @@ -109,6 +109,7 @@ def write_args(args, path): with open(os.path.join(path, "args.json"), "w") as of: json.dump(data, of) + def identity(obj): """ Identity function @@ -121,4 +122,4 @@ def identity(obj): obj The same object """ - return obj \ No newline at end of file + return obj -- GitLab From b6096c2e808ab732b5824af510f5133d7923d921 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Sat, 19 Mar 2022 19:39:39 +0100 Subject: [PATCH 15/16] option for 
when to evaluate on the trainset --- eval/run_all.sh | 3 ++- eval/testing.py | 1 + src/decentralizepy/node/Node.py | 44 +++++++++++++++++++++++++-------- src/decentralizepy/utils.py | 1 + 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/eval/run_all.sh b/eval/run_all.sh index c9e5714..d5d0c04 100755 --- a/eval/run_all.sh +++ b/eval/run_all.sh @@ -10,6 +10,7 @@ config_file=~/tmp/config.ini procs_per_machine=16 machines=6 iterations=5 +train_evaluate_after=5 test_after=21 # we do not test eval_file=testing.py log_level=INFO @@ -32,7 +33,7 @@ do mkdir -p $log_dir cp $i $config_file $python_bin/crudini --set $config_file COMMUNICATION addresses_filepath $ip_machines - $env_python $eval_file -ro 0 -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level + $env_python $eval_file -ro 0 -tea $train_evaluate_after -ld $log_dir -mid $m -ps $procs_per_machine -ms $machines -is $iterations -gf $graph -ta $test_after -cf $config_file -ll $log_level echo $i is done sleep 3 echo end of sleep diff --git a/eval/testing.py b/eval/testing.py index abd6333..efb80df 100644 --- a/eval/testing.py +++ b/eval/testing.py @@ -62,6 +62,7 @@ if __name__ == "__main__": args.log_dir, log_level[args.log_level], args.test_after, + args.train_evaluate_after, args.reset_optimizer, ], ) diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py index 7854c38..a543261 100644 --- a/src/decentralizepy/node/Node.py +++ b/src/decentralizepy/node/Node.py @@ -77,6 +77,7 @@ class Node: iterations, log_dir, test_after, + train_evaluate_after, reset_optimizer, ): """ @@ -96,6 +97,10 @@ class Node: Number of iterations (communication steps) for which the model should be trained log_dir : str Logging directory + test_after : int + Number of iterations after which the test loss and accuracy arecalculated + train_evaluate_after : int + Number of iterations after which the train loss is calculated reset_optimizer : int 1 if optimizer should be reset every communication round, else 0 @@ -108,6 +113,7 @@ class Node: self.log_dir = log_dir self.iterations = iterations self.test_after = test_after + self.train_evaluate_after = train_evaluate_after self.reset_optimizer = reset_optimizer logging.debug("Rank: %d", self.rank) @@ -262,6 +268,7 @@ class Node: log_dir=".", log_level=logging.INFO, test_after=5, + train_evaluate_after = 1, reset_optimizer=1, *args ): @@ -286,6 +293,10 @@ class Node: Logging directory log_level : logging.Level One of DEBUG, INFO, WARNING, ERROR, CRITICAL + test_after : int + Number of iterations after which the test loss and accuracy arecalculated + train_evaluate_after : int + Number of iterations after which the train loss is calculated reset_optimizer : int 1 if optimizer should be reset every communication round, else 0 args : optional @@ -302,6 +313,7 @@ class Node: iterations, log_dir, test_after, + train_evaluate_after, reset_optimizer, ) self.init_log(log_dir, rank, log_level) @@ -319,6 +331,7 @@ class Node: self.testset = self.dataset.get_testset() self.communication.connect_neighbors(self.graph.neighbors(self.uid)) rounds_to_test = self.test_after + rounds_to_train_evaluate = self.train_evaluate_after for iteration in range(self.iterations): logging.info("Starting training iteration: %d", iteration) @@ -332,7 +345,6 @@ class Node: ) # Reset optimizer state self.trainer.reset_optimizer(self.optimizer) - loss_after_sharing = self.trainer.eval_loss(self.dataset) if iteration: with open( @@ -352,7 +364,6 
@@ class Node: "grad_std": {}, } - results_dict["train_loss"][iteration + 1] = loss_after_sharing results_dict["total_bytes"][iteration + 1] = self.communication.total_bytes if hasattr(self.sharing, "total_meta"): @@ -365,14 +376,21 @@ class Node: results_dict["grad_mean"][iteration + 1] = self.sharing.mean if hasattr(self.sharing, "std"): results_dict["grad_std"][iteration + 1] = self.sharing.std - - self.save_plot( - results_dict["train_loss"], - "train_loss", - "Training Loss", - "Communication Rounds", - os.path.join(self.log_dir, "{}_train_loss.png".format(self.rank)), - ) + + rounds_to_train_evaluate -= 1 + + if rounds_to_test == 0: + logging.info("Evaluating on train set.") + rounds_to_train_evaluate = self.train_evaluate_after + loss_after_sharing = self.trainer.eval_loss(self.dataset) + results_dict["train_loss"][iteration + 1] = loss_after_sharing + self.save_plot( + results_dict["train_loss"], + "train_loss", + "Training Loss", + "Communication Rounds", + os.path.join(self.log_dir, "{}_train_loss.png".format(self.rank)), + ) rounds_to_test -= 1 @@ -417,6 +435,7 @@ class Node: log_dir=".", log_level=logging.INFO, test_after=5, + train_evaluate_after=1, reset_optimizer=1, *args ): @@ -453,6 +472,10 @@ class Node: Logging directory log_level : logging.Level One of DEBUG, INFO, WARNING, ERROR, CRITICAL + test_after : int + Number of iterations after which the test loss and accuracy arecalculated + train_evaluate_after : int + Number of iterations after which the train loss is calculated reset_optimizer : int 1 if optimizer should be reset every communication round, else 0 args : optional @@ -473,6 +496,7 @@ class Node: log_dir, log_level, test_after, + train_evaluate_after, reset_optimizer, *args ) diff --git a/src/decentralizepy/utils.py b/src/decentralizepy/utils.py index 82f2068..3ca85f5 100644 --- a/src/decentralizepy/utils.py +++ b/src/decentralizepy/utils.py @@ -75,6 +75,7 @@ def get_args(): parser.add_argument("-gf", "--graph_file", type=str, default="36_nodes.edges") parser.add_argument("-gt", "--graph_type", type=str, default="edges") parser.add_argument("-ta", "--test_after", type=int, default=5) + parser.add_argument("-tea", "--train_evaluate_after", type=int, default=1) parser.add_argument("-ro", "--reset_optimizer", type=int, default=1) args = parser.parse_args() -- GitLab From f353b78a9094e96e7cc5dbafa36b691654248525 Mon Sep 17 00:00:00 2001 From: Jeffrey Wigger Date: Sat, 19 Mar 2022 19:47:22 +0100 Subject: [PATCH 16/16] option for when to evaluate on the trainset fix --- src/decentralizepy/node/Node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/decentralizepy/node/Node.py b/src/decentralizepy/node/Node.py index a543261..74de4e1 100644 --- a/src/decentralizepy/node/Node.py +++ b/src/decentralizepy/node/Node.py @@ -379,7 +379,7 @@ class Node: rounds_to_train_evaluate -= 1 - if rounds_to_test == 0: + if rounds_to_train_evaluate == 0: logging.info("Evaluating on train set.") rounds_to_train_evaluate = self.train_evaluate_after loss_after_sharing = self.trainer.eval_loss(self.dataset) -- GitLab
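Patch 16 corrects the gate that patch 15 introduced: the train-set evaluation fired on `rounds_to_test` instead of its own counter. The intended countdown scheduling, as a standalone sketch of the pattern (illustrative names and periods):

    def should_fire(counter, period):
        # Decrement a countdown; fire and reset when it reaches zero.
        counter -= 1
        if counter == 0:
            return True, period
        return False, counter

    test_after, train_evaluate_after = 5, 2
    rounds_to_test, rounds_to_train_evaluate = test_after, train_evaluate_after
    for iteration in range(10):
        eval_train, rounds_to_train_evaluate = should_fire(
            rounds_to_train_evaluate, train_evaluate_after
        )
        do_test, rounds_to_test = should_fire(rounds_to_test, test_after)
        if eval_train:
            print(iteration + 1, "evaluate train loss")
        if do_test:
            print(iteration + 1, "test loss/accuracy")

With independent counters, the expensive full-trainset evaluation and the test-set pass can run on different schedules, which is exactly what the new `-tea` / `--train_evaluate_after` argument exposes.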