Skip to content

Commit

Permalink
add retry constraints and wait_for_success as true for training jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
Tooyosi committed Sep 11, 2024
1 parent a74e2d2 commit 01c5b05
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
3 changes: 2 additions & 1 deletion bajor/batch/train_finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id):
setup_pytorch_kernel_cache_dir = 'mkdir -p $AZ_BATCH_NODE_SHARED_DIR/.cache/torch/kernels'
job.job_preparation_task = batchmodels.JobPreparationTask(
command_line=f'/bin/bash -c \"set -ex; {setup_pytorch_kernel_cache_dir}; {create_results_dir}; {copy_code_to_shared_dir}\"',
constraints=batchmodels.TaskConstraints(max_task_retry_count=3),
#
# A busted preparation task means the main task won't launch...ever!
# and leaves the node in a scaled state, costing $$ ££
Expand All @@ -123,7 +124,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id):
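# Short term (historical): with wait_for_success=False the main task started without waiting for this prep task to complete.
# NOTE(review): wait_for_success is now True and the prep task is retried up to 3 times, so the main task waits for the prep task to succeed — confirm the rest of this comment block still applies.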
# https://learn.microsoft.com/en-us/python/api/azure-batch/azure.batch.models.JobPreparationTask?view=azure-python#constructor
# https://learn.microsoft.com/en-us/azure/batch/batch-job-task-error-checking#job-preparation-tasks
wait_for_success=False)
wait_for_success=True)


# Job release task that runs after the job completes
Expand Down
3 changes: 2 additions & 1 deletion bajor/batch/train_from_scratch.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id):
copy_code_to_shared_dir = 'cp -Rf $AZ_BATCH_NODE_MOUNTS_DIR/$TRAINING_CONTAINER_MOUNT_DIR/$CODE_DIR_PATH/* $AZ_BATCH_NODE_SHARED_DIR/'
job.job_preparation_task = batchmodels.JobPreparationTask(
command_line=f'/bin/bash -c \"set -ex; {create_results_dir}; {copy_code_to_shared_dir}\"',
constraints=batchmodels.TaskConstraints(max_task_retry_count=3),
#
# A busted preparation task means the main task won't launch...ever!
# and leaves the node in a scaled state, costing $$ ££
Expand All @@ -115,7 +116,7 @@ def create_batch_job(job_id, manifest_container_path, pool_id):
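# Short term (historical): with wait_for_success=False the main task started without waiting for this prep task to complete.
# NOTE(review): wait_for_success is now True and the prep task is retried up to 3 times, so the main task waits for the prep task to succeed — confirm the rest of this comment block still applies.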
# https://learn.microsoft.com/en-us/python/api/azure-batch/azure.batch.models.JobPreparationTask?view=azure-python#constructor
# https://learn.microsoft.com/en-us/azure/batch/batch-job-task-error-checking#job-preparation-tasks
wait_for_success=False)
wait_for_success=True)


# Job release task that runs after the job completes
Expand Down

0 comments on commit 01c5b05

Please sign in to comment.