From 1bffe7963a2604e6c18fa900340b22390109ba99 Mon Sep 17 00:00:00 2001
From: Lukas
Date: Tue, 2 May 2023 21:41:56 +0200
Subject: [PATCH 1/2] Set strategy to `ddp` in ddp config (#571)

---
 configs/trainer/ddp.yaml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/configs/trainer/ddp.yaml b/configs/trainer/ddp.yaml
index 4e5238e32..96bef3958 100644
--- a/configs/trainer/ddp.yaml
+++ b/configs/trainer/ddp.yaml
@@ -1,11 +1,7 @@
 defaults:
   - default.yaml
 
-# use "ddp_spawn" instead of "ddp",
-# it's slower but normal "ddp" currently doesn't work ideally with hydra
-# https://github.com/facebookresearch/hydra/issues/2070
-# https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn
-strategy: ddp_spawn
+strategy: ddp
 
 accelerator: gpu
 devices: 4

From 334271dd02458d6a13aed79a5725c3e034363dc3 Mon Sep 17 00:00:00 2001
From: Lukas
Date: Tue, 2 May 2023 21:45:48 +0200
Subject: [PATCH 2/2] Set `sync_dist=True` when logging best so far validation accuracy (#572)

---
 src/models/mnist_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/models/mnist_module.py b/src/models/mnist_module.py
index e1bb76d93..d27cc9f22 100644
--- a/src/models/mnist_module.py
+++ b/src/models/mnist_module.py
@@ -97,7 +97,7 @@ def on_validation_epoch_end(self):
         self.val_acc_best(acc)  # update best so far val acc
         # log `val_acc_best` as a value through `.compute()` method, instead of as a metric object
         # otherwise metric would be reset by lightning after each epoch
-        self.log("val/acc_best", self.val_acc_best.compute(), prog_bar=True)
+        self.log("val/acc_best", self.val_acc_best.compute(), sync_dist=True, prog_bar=True)
 
     def test_step(self, batch: Any, batch_idx: int):
         loss, preds, targets = self.model_step(batch)