From 45d8241faf2e3b17d62eb8715039191549b14f86 Mon Sep 17 00:00:00 2001
From: XaverStiensmeier
Date: Mon, 27 May 2024 14:22:47 +0200
Subject: [PATCH] changed RESUME to POWER_DOWN and removed delete call which is now handled via Slurm that calls terminate.sh

---
 .../playbook/roles/bibigrid/files/slurm/fail.sh | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/resources/playbook/roles/bibigrid/files/slurm/fail.sh b/resources/playbook/roles/bibigrid/files/slurm/fail.sh
index 38d723b4..af7904e9 100644
--- a/resources/playbook/roles/bibigrid/files/slurm/fail.sh
+++ b/resources/playbook/roles/bibigrid/files/slurm/fail.sh
@@ -21,6 +21,7 @@ process_string() {
 }
 
 mkdir -p worker_logs
+mkdir -p worker_logs/fail
 mkdir -p worker_logs/fail/out
 mkdir -p worker_logs/fail/err
 
@@ -34,16 +35,13 @@ function log {
 
 log "Fail-Script started"
 
-# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
-scontrol update NodeName="$1" state=RESUME reason=FailedStartup # no sudo needed cause executed by slurm user
-
 hosts=$(scontrol show hostnames "$1")
 
-echo "Hosts $hosts used"
+log "Hosts $hosts used"
 
-# delete servers
-python3 /usr/local/bin/delete_server.py "${hosts}"
+# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
+scontrol update NodeName="$1" state=POWER_DOWN reason=FailedStartup # no sudo needed cause executed by slurm user
 
-echo "Finished delete_server.py execution."
+log "Nodes $1 set to POWER_DOWN."
 
 exit $?