From 8829b8f0a4f26de62ed027b3f23dd509bf4c79be Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Wed, 29 Nov 2023 17:35:34 +0100 Subject: [PATCH] PT distributed, fix GPU0 memory Fix #1469 --- returnn/torch/engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/returnn/torch/engine.py b/returnn/torch/engine.py index b9c16c0996..07928c5e4a 100644 --- a/returnn/torch/engine.py +++ b/returnn/torch/engine.py @@ -108,6 +108,7 @@ def __init__(self, config: Config): print(f"Start running torch distributed training on local rank {local_rank}.", file=log.v2) assert self._device == "cuda", f"torch distributed: unexpected device {self._device!r}" self._device = f"cuda:{local_rank}" + torch.cuda.set_device(local_rank) # Theano and TensorFlow print sth like: Using gpu device 2: GeForce GTX 980 (...) # Print in a similar format so that some scripts which grep our stdout work just as before.