PyTorch distributed: fix GPU 0 memory usage by setting the CUDA device to the local rank

Fix #1469
rwth-i6 · Nov 29, 2023 · 8829b8f
1 parent 5b29a8c
commit 8829b8f
Showing 1 changed file with 1 addition and 0 deletions.
diff --git a/returnn/torch/engine.py b/returnn/torch/engine.py
@@ -108,6 +108,7 @@ def __init__(self, config: Config):
             print(f"Start running torch distributed training on local rank {local_rank}.", file=log.v2)
             assert self._device == "cuda", f"torch distributed: unexpected device {self._device!r}"
             self._device = f"cuda:{local_rank}"
+            torch.cuda.set_device(local_rank)
 
         # Theano and TensorFlow print sth like: Using gpu device 2: GeForce GTX 980 (...)
         # Print in a similar format so that some scripts which grep our stdout work just as before.