From adec73a527081b171477c44d2f92dfa0298c58c8 Mon Sep 17 00:00:00 2001
From: tooyosi
Date: Thu, 19 Dec 2024 15:20:35 +0000
Subject: [PATCH] remove gpu all flag

---
Notes (not part of the commit message):

Both hunks touch create_job_tasks() in bajor/batch/train_finetuning.py.

1. Task command line: the command runs under `set -ex`, so a failing
   `nvidia-smi` aborts the whole task. The added line is meant to turn
   that failure into a logged message instead. It must be written as
   `nvidia-smi || echo ...`; a `;` directly before `||` is a bash syntax
   error that would make the whole `/bin/bash -c "..."` string fail to
   parse before any command runs.

2. Container settings: `--gpus all` is dropped from
   container_run_options, leaving only `--ipc=host`.

NOTE(review): this patch was reconstructed from a copy whose line breaks
and indentation had been collapsed. The leading whitespace on the hunk
lines below is a best guess; verify it against the target file or apply
with `git apply --ignore-whitespace`. The `index` line is carried over
unchanged from the original patch.

 bajor/batch/train_finetuning.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bajor/batch/train_finetuning.py b/bajor/batch/train_finetuning.py
index e578032..60bebf2 100644
--- a/bajor/batch/train_finetuning.py
+++ b/bajor/batch/train_finetuning.py
@@ -234,7 +234,7 @@ def create_job_tasks(job_id, task_id=1, run_opts=''):
         '/bin/bash -c "'
         'set -ex; '
         f'{wait_for_preparation_task_completion}; '
-        'nvidia-smi; '
+        'nvidia-smi || echo \\"nvidia-smi command failed\\"; '
         'python -c \\"import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())\\"; '
         f'{setup_pytorch_kernel_cache_env_var}; '
         f'{setup_hugging_face_cache_env_var}; '
@@ -251,7 +251,7 @@ def create_job_tasks(job_id, task_id=1, run_opts=''):
         container_settings=batchmodels.TaskContainerSettings(
             image_name=os.getenv('CONTAINER_IMAGE_NAME'),
             working_directory='taskWorkingDirectory',
-            container_run_options='--ipc=host --gpus all'
+            container_run_options='--ipc=host'
         ),
         user_identity = batchmodels.UserIdentity(
             auto_user=batchmodels.AutoUserSpecification(