Skip to content
Snippets Groups Projects
Commit 680a56dd authored by Mathis "what could possibly go wrong" Randl's avatar Mathis "what could possibly go wrong" Randl
Browse files

added 35-workers scenario

parent 55d72cb9
No related branches found
No related tags found
No related merge requests found
36
1 11
1 6
1 15
1 24
2 17
2 19
2 27
2 8
3 18
3 27
3 19
3 31
4 11
4 27
4 29
4 24
5 27
5 8
5 30
5 16
6 1
6 34
6 13
6 14
7 25
7 28
7 32
7 16
8 17
8 2
8 19
8 5
9 35
9 20
9 13
9 32
10 33
10 34
10 35
10 14
11 1
11 20
11 4
11 21
12 28
12 20
12 14
12 15
13 9
13 33
13 6
13 15
14 10
14 35
14 12
14 6
15 1
15 35
15 12
15 13
16 26
16 5
16 30
16 7
17 2
17 29
17 23
17 8
18 26
18 3
18 21
18 30
19 2
19 3
19 26
19 8
20 33
20 9
20 11
20 12
21 18
21 11
21 31
21 34
22 28
22 31
22 23
22 32
23 33
23 17
23 22
23 31
24 1
24 25
24 4
24 30
25 34
25 32
25 7
25 24
26 18
26 19
26 29
26 16
27 2
27 3
27 4
27 5
28 12
28 29
28 22
28 7
29 28
29 26
29 4
29 17
30 18
30 5
30 24
30 16
31 3
31 21
31 22
31 23
32 25
32 9
32 22
32 7
33 10
33 20
33 13
33 23
34 25
34 10
34 21
34 6
35 9
35 10
35 14
35 15
36
1 2
1 17
1 28
1 30
2 1
2 3
2 7
2 8
2 19
2 31
3 2
3 4
3 5
3 23
3 25
3 26
4 34
4 3
4 5
4 16
4 18
5 3
5 4
5 6
5 10
5 23
6 33
6 5
6 7
6 9
6 20
6 26
7 8
7 2
7 6
8 32
8 2
8 34
8 7
8 9
9 35
9 6
9 8
9 10
9 11
9 18
9 23
9 31
10 34
10 5
10 9
10 11
10 17
10 18
10 22
10 23
11 34
11 9
11 10
11 12
11 19
11 25
11 27
11 29
11 30
12 32
12 11
12 13
12 15
12 16
12 23
13 12
13 14
13 15
13 18
13 25
14 35
14 13
14 15
14 16
14 25
15 33
15 12
15 13
15 14
15 16
15 18
15 27
15 30
16 35
16 4
16 12
16 14
16 15
16 17
17 1
17 10
17 16
17 18
17 19
18 32
18 4
18 9
18 10
18 13
18 15
18 17
18 19
18 20
19 2
19 11
19 17
19 18
19 20
19 30
20 35
20 6
20 18
20 19
20 21
20 22
20 27
21 20
21 22
21 23
21 29
21 30
22 10
22 20
22 21
22 23
22 25
23 3
23 5
23 9
23 10
23 12
23 21
23 22
23 24
23 29
24 25
24 23
25 33
25 3
25 35
25 11
25 13
25 14
25 22
25 24
25 26
25 29
25 31
26 27
26 25
26 3
26 6
27 35
27 11
27 15
27 20
27 26
27 28
28 1
28 27
28 29
29 11
29 21
29 23
29 25
29 28
29 30
30 32
30 1
30 11
30 15
30 19
30 21
30 29
30 31
31 32
31 2
31 9
31 25
31 30
32 33
32 8
32 12
32 18
32 30
32 31
33 32
33 34
33 6
33 15
33 25
34 33
34 35
34 4
34 8
34 10
34 11
35 34
35 9
35 14
35 16
35 20
35 25
35 27
......@@ -76,6 +76,7 @@ class DPSGDNodeTimeout(Node):
_, data = rec
self.model.load_state_dict(data["model"])
self.sharing._post_step()
self.sharing.communication_round += 1
self.broadcast_weights_to_neighbors(ite)
if self.reset_optimizer:
......
......@@ -268,14 +268,23 @@ class TimeoutParameterServer(Node):
Start the federated parameter-serving service.
"""
self.testset = self.dataset.get_testset()
#global_epoch = 1
#change = 1
#to_send = dict()
self.current_workers = self.my_neighbors
logging.info("begin tps")
for ite in range(self.iterations):
#self.iteration = iteration
logging.info("begin iteration " + str(ite))
self.peer_deques = dict()
to_send = {"CHANNEL" : "WORKER_REQUEST", "model" : self.model.state_dict()}#, "iteration" : ite}
for worker in self.current_workers:
self.communication.send(worker, {"CHANNEL" : "WORKER_REQUEST", "model" : self.model.state_dict()})
self.communication.send(worker, to_send)
self.peer_deques = dict()
awaiting = True
while awaiting and len(self.peer_deques) != len(self.current_workers):
rec = self.receive_channel("WORKER_ANSWER", timeout = self.timeout)
......@@ -291,17 +300,10 @@ class TimeoutParameterServer(Node):
for worker in self.peer_deques.keys():
averaging_deque[worker] = self.peer_deques[worker]
to_add = self.sharing.get_data_to_send()
to_add["degree"] = 1
to_add["iteration"] = ite
to_add["CHANNEL"] = "localchan"
averaging_deque[self.uid] = deque()
averaging_deque[self.uid].append(to_add)
averaging_deque = dict(sorted(averaging_deque.items()))
self.sharing._pre_step()
self.sharing._averaging_server(averaging_deque)
if len(averaging_deque) > 0:
averaging_deque = dict(sorted(averaging_deque.items()))
self.sharing._pre_step()
self.sharing._averaging_server(averaging_deque)
rec = self.receive_channel("trash_chan", timeout = 3000.0)
......
......@@ -2,8 +2,8 @@
dataset_package = decentralizepy.datasets.CIFAR10
dataset_class = CIFAR10
model_class = LeNet
train_dir = ../CIFAR
test_dir = ../CIFAR
train_dir = /decentralizepy/CIFAR
test_dir = /decentralizepy/CIFAR
; python list of fractions below
sizes =
random_seed = 90
......@@ -13,14 +13,14 @@ shards = 5
[OPTIMIZER_PARAMS]
optimizer_package = torch.optim
optimizer_class = SGD
lr = 0.05
lr = 0.075
[TRAIN_PARAMS]
training_package = decentralizepy.training.Training
training_class = Training
rounds = 3
rounds = 10
full_epochs = False
batch_size = 300
batch_size = 32
shuffle = True
loss_package = torch.nn
loss_class = CrossEntropyLoss
......
......@@ -4,19 +4,19 @@ decpy_path=../eval
cd $decpy_path
env_python=python3
graph=./5-star-server.edges
graph=./35_reg_4_workers.edges
original_config=../tutorial/remote_config_celeba_sharing.ini
config_file=./config.ini
procs_per_machine=1
machines=6
iterations=50
test_after=50
machines=36
iterations=600
test_after=600
eval_file=testingTimeout.py
log_level=INFO
server_rank=-1
server_machine=0
working_rate=1
threads_per_worker=3
threads_per_worker=1
m=$(cat /identity.txt)
echo M is $m
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment