From 3fd93655e1a285eb72655a6551b389a443457f64 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Wed, 20 Nov 2024 20:18:19 -0500 Subject: [PATCH 1/7] Add --fail-fast --- disbatch/disBatch.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/disbatch/disBatch.py b/disbatch/disBatch.py index 4dc70e7..f2fdc0a 100644 --- a/disbatch/disBatch.py +++ b/disbatch/disBatch.py @@ -1217,6 +1217,7 @@ def __init__(self, kvs, db_info, tasks, trackResults=None): self.statusLastOffset = self.statusFile.tell() self.noMoreTasks = False self.tasksDone = False + self.failFast = db_info.args.fail_fast self.daemon = True self.start() @@ -1525,6 +1526,11 @@ def run(self): # Remember the first failure. Somewhat arbitrary. self.currentReturnCode = rc + if self.failed and self.failFast: + logger.info(f'Failing fast, task exited with code: {self.currentReturnCode}') + self.ageQ.put('CheckFailExit') + break + # Maybe we want to track results by streamIndex instead of taskId? But then there could be more than # one per key if self.trackResults: @@ -2234,6 +2240,7 @@ def shutdown(s=None, f=None): '--use-address', default=None, metavar='HOST:PORT', help='Specify hostname and port to use for this run.' ) argp.add_argument('-w', '--web', action='store_true', help='Enable web interface.') + argp.add_argument('-f', '--fail-fast', action='store_true', help='Exit on first task failure.') source = argp.add_mutually_exclusive_group(required=True) source.add_argument( '--taskcommand', From 07614181a8543b1beffde4d0d7a83df99d0b5724 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Wed, 20 Nov 2024 20:18:32 -0500 Subject: [PATCH 2/7] Add --fail-fast test --- tests/test_ssh/run.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_ssh/run.sh b/tests/test_ssh/run.sh index fab0f48..e61e82b 100755 --- a/tests/test_ssh/run.sh +++ b/tests/test_ssh/run.sh @@ -1,7 +1,7 @@ #!/bin/bash workdir=$(mktemp -d -p ./ disbatch-test.XXXX) -cp Tasks $workdir +cp -t $workdir Tasks Tasks_failfast cd $workdir # Run the test @@ -12,6 +12,11 @@ disBatch -s localhost:2 Tasks [[ -f A.txt && -f B.txt && -f C.txt ]] success=$? +rm A.txt B.txt C.txt +disbatch -s localhost:2 --fail-fast Tasks_failfast +[[ ! -f A.txt ]] +success=$((success + $?)) + cd - > /dev/null if [[ $success -eq 0 ]]; then From 73f404cc159455804575b1a4f931f366ef1e8571 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Wed, 20 Nov 2024 20:24:32 -0500 Subject: [PATCH 3/7] Fix test for macos --- tests/test_ssh/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ssh/run.sh b/tests/test_ssh/run.sh index e61e82b..56211be 100755 --- a/tests/test_ssh/run.sh +++ b/tests/test_ssh/run.sh @@ -1,7 +1,7 @@ #!/bin/bash workdir=$(mktemp -d -p ./ disbatch-test.XXXX) -cp -t $workdir Tasks Tasks_failfast +cp Tasks Tasks_failfast $workdir cd $workdir # Run the test From d65b76d2299639897b7e7b15ce2d70a8be5ca38c Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Wed, 20 Nov 2024 20:30:09 -0500 Subject: [PATCH 4/7] Fix silently failing test --- tests/test_slurm/run.sh | 6 ++++-- tests/test_ssh/Tasks_failfast | 3 +++ tests/test_ssh/run.sh | 2 ++ 3 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 tests/test_ssh/Tasks_failfast diff --git a/tests/test_slurm/run.sh b/tests/test_slurm/run.sh index c6a5487..54edac8 100755 --- a/tests/test_slurm/run.sh +++ b/tests/test_slurm/run.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + workdir=$(mktemp -d -p ./ disbatch-test.XXXX) cp Tasks $workdir cd $workdir @@ -9,8 +11,8 @@ salloc -n 2 disBatch Tasks # Check that all 3 tasks ran, # which means A.txt, B.txt, and C.txt exist -[[ -f A.txt && -f B.txt && -f C.txt ]] -success=$? +success=0 +[[ -f A.txt && -f B.txt && -f C.txt ]] || success=$? cd - > /dev/null diff --git a/tests/test_ssh/Tasks_failfast b/tests/test_ssh/Tasks_failfast new file mode 100644 index 0000000..8291406 --- /dev/null +++ b/tests/test_ssh/Tasks_failfast @@ -0,0 +1,3 @@ +sleep 1000 +exit 1 +touch A.txt diff --git a/tests/test_ssh/run.sh b/tests/test_ssh/run.sh index 56211be..6d332c2 100755 --- a/tests/test_ssh/run.sh +++ b/tests/test_ssh/run.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -e + workdir=$(mktemp -d -p ./ disbatch-test.XXXX) cp Tasks Tasks_failfast $workdir cd $workdir From 81de4daf4b9be25cfb470ef798384276aa893bbb Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 25 Nov 2024 18:34:35 -0500 Subject: [PATCH 5/7] Add stderr message if failing fast --- disbatch/disBatch.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/disbatch/disBatch.py b/disbatch/disBatch.py index f2fdc0a..04724b9 100644 --- a/disbatch/disBatch.py +++ b/disbatch/disBatch.py @@ -1528,7 +1528,9 @@ def run(self): if self.failed and self.failFast: logger.info(f'Failing fast, task exited with code: {self.currentReturnCode}') + print('Quitting early due to task failure with --fail-fast', file=sys.stderr) self.ageQ.put('CheckFailExit') + # Break out of the main driver control loop and drop into the exit code break # Maybe we want to track results by streamIndex instead of taskId? But then there could be more than @@ -1577,6 +1579,7 @@ def run(self): # A "check" barrier fails if any tasks before it do (since the start or the last barrier). logger.info('Barrier check failed: %d.', self.currentReturnCode) self.ageQ.put('CheckFailExit') + # Break out of the main driver control loop and drop into the exit code break # Let the feeder know. self.ageQ.put(bTinfo.taskId) @@ -2240,7 +2243,12 @@ def shutdown(s=None, f=None): '--use-address', default=None, metavar='HOST:PORT', help='Specify hostname and port to use for this run.' ) argp.add_argument('-w', '--web', action='store_true', help='Enable web interface.') - argp.add_argument('-f', '--fail-fast', action='store_true', help='Exit on first task failure.') + argp.add_argument( + '-f', + '--fail-fast', + action='store_true', + help='Exit on first task failure. Running tasks will be interrupted and disBatch will exit with a non-zero exit code.', + ) source = argp.add_mutually_exclusive_group(required=True) source.add_argument( '--taskcommand', From e5522298a49836337ebabbf7172e2e63df0ee37b Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 25 Nov 2024 18:44:16 -0500 Subject: [PATCH 6/7] fail fast: update readme and changelog --- CHANGES.md | 1 + Readme.md | 71 +++++++++++++++++++++--------------------------------- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 0dba0c4..799e7d6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -22,6 +22,7 @@ https://github.com/flatironinstitute/disBatch/pull/32 - Refreshed the readme - Added `disbatch --version` and `disbatch.__version__` - Added MacOS test +- Added `--fail-fast` option [https://github.com/flatironinstitute/disBatch/pull/38] ### Changes - `kvsstcp` submodule is now vendored diff --git a/Readme.md b/Readme.md index 9d74542..6a2e51d 100644 --- a/Readme.md +++ b/Readme.md @@ -289,14 +289,11 @@ disBatch refers to a collection of execution resources as a *context* and the re ## Invocation ``` -usage: disbatch [-h] [-e] [--force-resume] [--kvsserver [HOST:PORT]] - [--logfile FILE] - [--loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG}] [--mailFreq N] - [--mailTo ADDR] [-p PATH] [-r STATUSFILE] [-R] [-S] - [--status-header] [--use-address HOST:PORT] [-w] - [--taskcommand COMMAND] [--taskserver [HOST:PORT]] - [-C TASK_LIMIT] [-c N] [--fill] [-g] [--no-retire] [-l COMMAND] - [--retire-cmd COMMAND] [-s HOST:CORECOUNT] [-t N] +usage: disbatch [-h] [-e] [--force-resume] [--kvsserver [HOST:PORT]] [--logfile FILE] + [--loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG}] [--mailFreq N] [--mailTo ADDR] [-p PATH] + [-r STATUSFILE] [-R] [-S] [--status-header] [--use-address HOST:PORT] [-w] [-f] + [--taskcommand COMMAND] [--taskserver [HOST:PORT]] [--version] [-C TASK_LIMIT] [-c N] [--fill] + [--no-retire] [-l COMMAND] [--retire-cmd COMMAND] [-s HOST:CORECOUNT] [-t N] [taskfile] Use batch resources to process a file of tasks, one task per line. @@ -306,63 +303,51 @@ positional arguments: options: -h, --help show this help message and exit - -e, --exit-code When any task fails, exit with non-zero status (default: - only if disBatch itself fails) - --force-resume With -r, proceed even if task commands/lines are - different. + -e, --exit-code When any task fails, exit with non-zero status (default: only if disBatch itself fails) + --force-resume With -r, proceed even if task commands/lines are different. --kvsserver [HOST:PORT] Use a running KVS server. --logfile FILE Log file. --loglevel {CRITICAL,ERROR,WARNING,INFO,DEBUG} Logging level (default: INFO). - --mailFreq N Send email every N task completions (default: 1). "-- - mailTo" must be given. + --mailFreq N Send email every N task completions (default: 1). "--mailTo" must be given. --mailTo ADDR Mail address for task completion notification(s). -p PATH, --prefix PATH - Path for log, dbUtil, and status files (default: "."). - If ends with non-directory component, use as prefix for - these files names (default: - _disBatch__). + Path for log, dbUtil, and status files (default: "."). If ends with non-directory component, + use as prefix for these files names (default: _disBatch__). -r STATUSFILE, --resume-from STATUSFILE - Read the status file from a previous run and skip any - completed tasks (may be specified multiple times). - -R, --retry With -r, also retry any tasks which failed in previous - runs (non-zero return). - -S, --startup-only Startup only the disBatch server (and KVS server if - appropriate). Use "dbUtil..." script to add execution - contexts. Incompatible with "--ssh-node". + Read the status file from a previous run and skip any completed tasks (may be specified + multiple times). + -R, --retry With -r, also retry any tasks which failed in previous runs (non-zero return). + -S, --startup-only Startup only the disBatch server (and KVS server if appropriate). Use "dbUtil..." script to + add execution contexts. Incompatible with "--ssh-node". --status-header Add header line to status file. --use-address HOST:PORT Specify hostname and port to use for this run. -w, --web Enable web interface. + -f, --fail-fast Exit on first task failure. Running tasks will be interrupted and disBatch will exit with a + non-zero exit code. --taskcommand COMMAND - Tasks will come from the command specified via the KVS - server (passed in the environment). + Tasks will come from the command specified via the KVS server (passed in the environment). --taskserver [HOST:PORT] Tasks will come from the KVS server. + --version Print the version and exit -C TASK_LIMIT, --context-task-limit TASK_LIMIT Shutdown after running COUNT tasks (0 => no limit). -c N, --cpusPerTask N - Number of cores used per task; may be fractional - (default: 1). - --fill Try to use extra cores if allocated cores exceeds - requested cores. - -g, --gpu Use assigned GPU resources [DEPRECATED] - --no-retire Don't retire nodes from the batch system (e.g., if - running as part of a larger job). + Number of cores used per task; may be fractional (default: 1). + --fill Try to use extra cores if allocated cores exceeds requested cores. + --no-retire Don't retire nodes from the batch system (e.g., if running as part of a larger job). -l COMMAND, --label COMMAND Label for this context. Should be unique. - --retire-cmd COMMAND Shell command to run to retire a node (environment - includes $NODE being retired, remaining $ACTIVE node - list, $RETIRED node list; default based on batch - system). Incompatible with "--ssh-node". + --retire-cmd COMMAND Shell command to run to retire a node (environment includes $NODE being retired, remaining + $ACTIVE node list, $RETIRED node list; default based on batch system). Incompatible with "-- + ssh-node". -s HOST:CORECOUNT, --ssh-node HOST:CORECOUNT - Run tasks over SSH on the given nodes (can be specified - multiple times for additional hosts; equivalent to - setting DISBATCH_SSH_NODELIST) + Run tasks over SSH on the given nodes (can be specified multiple times for additional hosts; + equivalent to setting DISBATCH_SSH_NODELIST) -t N, --tasksPerNode N - Maximum concurrently executing tasks per node (up to - cores/cpusPerTask). + Maximum concurrently executing tasks per node (up to cores/cpusPerTask). ``` The options for mail will only work if your computing environment permits processes to access mail via SMTP. From 54767dc05e86827c6f5d4a10b79fea5cb63b5652 Mon Sep 17 00:00:00 2001 From: Lehman Garrison Date: Mon, 25 Nov 2024 18:45:34 -0500 Subject: [PATCH 7/7] fail fast: update tests --- tests/test_slurm/run.sh | 32 +++++++++++++++++++------------- tests/test_ssh/run.sh | 31 ++++++++++++++++--------------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/tests/test_slurm/run.sh b/tests/test_slurm/run.sh index 54edac8..b43af44 100755 --- a/tests/test_slurm/run.sh +++ b/tests/test_slurm/run.sh @@ -1,9 +1,15 @@ #!/bin/bash -set -e +exit_fail() { + err=$? + echo "Slurm test failed! Output is in $workdir" + exit $err +} + +trap exit_fail ERR -workdir=$(mktemp -d -p ./ disbatch-test.XXXX) -cp Tasks $workdir +workdir=$(mktemp -d -p $PWD disbatch-test.XXXX) +cp Tasks Tasks_failfast $workdir cd $workdir # Run the test @@ -11,16 +17,16 @@ salloc -n 2 disBatch Tasks # Check that all 3 tasks ran, # which means A.txt, B.txt, and C.txt exist -success=0 -[[ -f A.txt && -f B.txt && -f C.txt ]] || success=$? +[[ -f A.txt && -f B.txt && -f C.txt ]] -cd - > /dev/null +rm -f A.txt B.txt C.txt -if [[ $success -eq 0 ]]; then - echo "Slurm test passed." - rm -rf $workdir -else - echo "Slurm test failed! Output is in $workdir" -fi +# disBatch is expected to exit with a non-zero exit code here +salloc -n 2 disbatch --fail-fast Tasks_failfast || true + +# check that we failed fast and didn't run any more tasks +[[ ! -f A.txt ]] -exit $success +trap - ERR +echo "Slurm test passed." +rm -rf $workdir diff --git a/tests/test_ssh/run.sh b/tests/test_ssh/run.sh index 6d332c2..dfeb089 100755 --- a/tests/test_ssh/run.sh +++ b/tests/test_ssh/run.sh @@ -1,8 +1,14 @@ #!/bin/bash -set -e +exit_fail() { + err=$? + echo "SSH test failed! Output is in $workdir" + exit $err +} + +trap exit_fail ERR -workdir=$(mktemp -d -p ./ disbatch-test.XXXX) +workdir=$(mktemp -d -p $PWD disbatch-test.XXXX) cp Tasks Tasks_failfast $workdir cd $workdir @@ -12,20 +18,15 @@ disBatch -s localhost:2 Tasks # Check that all 3 tasks ran, # which means A.txt, B.txt, and C.txt exist [[ -f A.txt && -f B.txt && -f C.txt ]] -success=$? -rm A.txt B.txt C.txt -disbatch -s localhost:2 --fail-fast Tasks_failfast -[[ ! -f A.txt ]] -success=$((success + $?)) +rm -f A.txt B.txt C.txt -cd - > /dev/null +# disBatch is expected to exit with a non-zero exit code here +disbatch -s localhost:2 --fail-fast Tasks_failfast || true -if [[ $success -eq 0 ]]; then - echo "SSH test passed." - rm -rf $workdir -else - echo "SSH test failed! Output is in $workdir" -fi +# check that we failed fast and didn't run any more tasks +[[ ! -f A.txt ]] -exit $success +trap - ERR +echo "SSH test passed." +rm -rf $workdir