From 119f8d7b784690f1d14425f84fac8ce92abc14b0 Mon Sep 17 00:00:00 2001
From: priyakasimbeg
Date: Tue, 31 Oct 2023 23:39:54 +0000
Subject: [PATCH] add flag for setting max split size

Add a boolean command-line flag, `set_pytorch_max_split_size`, to
submission_runner.py. When the flag is passed as true, main() exports
PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256. When the flag is left at
its default (None) or set to false, the environment is not modified.

---
Reviewer notes (this section is ignored by `git am`):

- This patch touches submission_runner.py only. A README.md hunk must not
  be part of it: the only README change on this branch consisted of
  unresolved merge-conflict markers (<<<<<<< HEAD / ======= / >>>>>>>)
  plus a duplicate, single-dash `-keep_container_alive true` line in the
  docker example, i.e. no intended documentation change.
- NOTE(review): PYTORCH_CUDA_ALLOC_CONF is set inside main(), whereas
  pytorch_setup() is called at module level, before main() runs.
  Presumably the variable must be in the environment before the CUDA
  caching allocator is first initialised; confirm that pytorch_setup()
  does not already trigger that, otherwise the flag has no effect.

 submission_runner.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/submission_runner.py b/submission_runner.py
index 656599a42..6d4cc98e2 100644
--- a/submission_runner.py
+++ b/submission_runner.py
@@ -149,6 +149,11 @@
     None,
     'Value of rng seed. If None, a random seed will'
     'be generated from hardware.')
+flags.DEFINE_boolean(
+    'set_pytorch_max_split_size',
+    None,
+    'If true, set pytorch max_split_size_mb to 256'
+)
 FLAGS = flags.FLAGS
 USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_setup()
 
@@ -601,6 +606,9 @@ def main(_):
   # Prevent OOM on librispeech conformer.
   if FLAGS.workload == 'librispeech_conformer':
     os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85'
+
+  if FLAGS.set_pytorch_max_split_size is True:
+    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'
 
   # Extend path according to framework.
   workload_metadata['workload_path'] = os.path.join(