diff --git a/helm/hpcc/templates/thor.yaml b/helm/hpcc/templates/thor.yaml index 7df27415d80..8edd3831d20 100644 --- a/helm/hpcc/templates/thor.yaml +++ b/helm/hpcc/templates/thor.yaml @@ -99,7 +99,7 @@ data: spec: {{- include "hpcc.placementsByJobTargetType" (dict "root" .root "job" $eclAgentJobName "target" .me.name "type" "thor") | indent 10 }} serviceAccountName: "hpcc-agent" - terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 60 }} + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" . | indent 10 }} {{- include "hpcc.addImagePullSecrets" . | nindent 10 -}} @@ -241,7 +241,7 @@ data: spec: {{- include "hpcc.placementsByJobTargetType" (dict "root" .root "job" $thorWorkerJobName "target" .me.name "type" "thor") | indent 10 }} serviceAccountName: hpcc-default - terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 60 }} + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" . | indent 10 }} {{- include "hpcc.addImagePullSecrets" . | nindent 10 -}} diff --git a/initfiles/bin/check_executes.sh b/initfiles/bin/check_executes.sh index 405af7c7da2..405c8885eea 100755 --- a/initfiles/bin/check_executes.sh +++ b/initfiles/bin/check_executes.sh @@ -1,5 +1,14 @@ #!/bin/bash +# ---------------------------------------------------------------------------- +# This script is the main entry point for all HPCC helm components. +# It will launch the component process and check its exit status. +# If the process exits with a non-zero status, it will collect post-mortem +# information and associate it with the workunit if specified. +# It is also responsible for periodically updating the running file that +# the postrun sidecar monitors (see container_watch.sh). 
+# ---------------------------------------------------------------------------- + # The yaml passes in all arguments as a single string, due to the way the args are built up, # and additional args _HPCC_ARGS_ are substituted in. # Split the single string argument into individual arguments diff --git a/initfiles/bin/collect_postmortem.sh b/initfiles/bin/collect_postmortem.sh index 1b16a6e12ae..89915620c24 100755 --- a/initfiles/bin/collect_postmortem.sh +++ b/initfiles/bin/collect_postmortem.sh @@ -1,5 +1,14 @@ #!/bin/bash +# ---------------------------------------------------------------------------- +# This script collects various post-mortem information and writes it to a unique +# subdirectory within the specified output directory (which should be on persistent +# storage, e.g. the debug plane). +# It will then associate this new directory with the workunit if available. +# It is either launched from the main container's entrypoint script (check_executes.sh), +# or from the 'postrun' sidecar container (container_watch.sh). +# ---------------------------------------------------------------------------- + container="" daliServer="" directory="" diff --git a/initfiles/bin/container_watch.sh b/initfiles/bin/container_watch.sh index a485ec652b2..4ad1df6b6d9 100755 --- a/initfiles/bin/container_watch.sh +++ b/initfiles/bin/container_watch.sh @@ -1,5 +1,18 @@ #!/bin/bash +# ---------------------------------------------------------------------------- +# This script is used by the 'postrun' sidecar container to monitor the +# main containers by tracking their 'running' and 'stopped' files. +# If a 'running' file is not updated within a certain time period, the +# script will trigger a postmortem collection. +# This can happen if the main containers are abruptly halted by k8s, +# e.g. due to k8s OOM evictions. +# NB: The isJob option causes the script to exit if it detects that the +# job has finished (i.e. the main container has stopped). 
Otherwise, it +# will continue to loop and wait for a new instance of the main container to +# start. +# ---------------------------------------------------------------------------- + config="" daliServer="" directory=""