Skip to content

Commit

Permalink
Merge pull request #220 from cevich/mac_pw_pool_fix_max_tasks
Browse files Browse the repository at this point in the history
Fix possible max-tasks PW pool cascade failure
  • Loading branch information
cevich authored Aug 6, 2024
2 parents 27f6f93 + 7ae84eb commit 718ecdb
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
16 changes: 13 additions & 3 deletions mac_pw_pool/SetupInstances.sh
Original file line number Diff line number Diff line change
Expand Up @@ -218,10 +218,13 @@ for _dhentry in "${_dhstate[@]}"; do
dbg " now_epoch=$now_epoch"
dbg " age_sec=$age_sec"
dbg "hard_max_sec=$hard_max_sec"
msg "Instance alive for $((age_sec/60/60)) hours (max $PW_MAX_HOURS)"
# Soft time limit is enforced via 'sleep $PW_MAX_HOURS && shutdown' started during instance setup (below).
msg "Instance alive for $((age_sec/60/60)) hours (soft max: $PW_MAX_HOURS hard: $((hard_max_sec/60/60)))"
if [[ $age_sec -gt $hard_max_sec ]]; then
force_term "Excess instance lifetime (+$((age_sec-hard_max_sec))s)"
force_term "Excess instance lifetime; $(((age_sec - hard_max_sec)/60))m past hard max limit."
continue
elif [[ $age_sec -gt $((PW_MAX_HOURS*60*60)) ]]; then
pwst_warn "Instance alive longer than soft max. Investigation recommended."
fi

dbg "Attempting to contact '$name' at $pub_dns"
Expand Down Expand Up @@ -424,8 +427,15 @@ for _dhentry in "${_dhstate[@]}"; do
msg "Apparent tasks started/finished/running: $n_started_tasks $n_finished_tasks $((n_started_tasks-n_finished_tasks)) (max $PW_MAX_TASKS)"

dbg "Checking apparent task limit"
if [[ "$n_finished_tasks" -gt $PW_MAX_TASKS ]]; then
# N/B: This is only enforced based on the _previous_ run of this script worker-count.
# Doing this on the _current_ alive worker count would add a lot of complexity.
if [[ "$n_finished_tasks" -gt $PW_MAX_TASKS ]] && [[ $n_pw_total -gt $PW_MIN_ALIVE ]]; then
# N/B: Termination based on _finished_ tasks, so if a task happens to be currently running
# it will very likely have _just_ started in the last few seconds. Cirrus will retry
# automatically on another worker.
force_term "Instance exceeded $PW_MAX_TASKS apparent tasks."
elif [[ $n_pw_total -le $PW_MIN_ALIVE ]]; then
pwst_warn "Not enforcing max-tasks limit, only $n_pw_total workers online last run."
fi
done

Expand Down
5 changes: 3 additions & 2 deletions mac_pw_pool/pw_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@ CREATE_STAGGER_HOURS=2

# Instance shutdown controls (assumes terminate-on-shutdown behavior)
PW_MAX_HOURS=24 # Since successful configuration
PW_MAX_TASKS=12 # Logged by listener (N/B: Log can be manipulated by tasks!)
PW_MAX_TASKS=24 # Logged by listener (N/B: Log can be manipulated by tasks!)
PW_MIN_ALIVE=3 # Bypass enforcement of $PW_MAX_TASKS if <= alive/operating workers

# How long to wait for setup.sh to finish running (drop a .setup.done file)
# before forcibly terminating.
SETUP_MAX_SECONDS=1200 # Typical time ~600seconds
SETUP_MAX_SECONDS=2400 # Typical time ~10 minutes, use 2x safety-factor.

# Name of launch template. Current/default version will be used.
# https://us-east-1.console.aws.amazon.com/ec2/home?region=us-east-1#LaunchTemplates:
Expand Down

0 comments on commit 718ecdb

Please sign in to comment.