diff --git a/bajor/batch/train_finetuning.py b/bajor/batch/train_finetuning.py index 5ab7217..dff047f 100644 --- a/bajor/batch/train_finetuning.py +++ b/bajor/batch/train_finetuning.py @@ -233,7 +233,7 @@ def create_job_tasks(job_id, task_id=1, run_opts=''): command = ( '/bin/bash -c "' 'set -ex; ' - 'nvidia-smi; ' + 'nvidia-smi || echo \\"nvidia-smi command failed\\"; ' 'python -c \\"import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())\\"; ' f'{wait_for_preparation_task_completion}; ' f'{setup_pytorch_kernel_cache_env_var}; ' @@ -251,7 +251,7 @@ def create_job_tasks(job_id, task_id=1, run_opts=''): container_settings=batchmodels.TaskContainerSettings( image_name=os.getenv('CONTAINER_IMAGE_NAME'), working_directory='taskWorkingDirectory', - container_run_options='--ipc=host' + container_run_options='--ipc=host --gpus all' ), user_identity = batchmodels.UserIdentity( auto_user=batchmodels.AutoUserSpecification(