From 2e1e04778c6d2062a0acd2d9efed4f652716e1b9 Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Tue, 1 Oct 2024 15:21:58 +0100 Subject: [PATCH] HPCC-32683 Fix issues with postmortem and container death helm changes: - Consistently generate command via addCommandAndLifecycle, and collect added container information in lifeCycleCtx based on containers added to lifeCycleCtx - Add terminationGracePeriodSeconds option - Mount ephemeral directories in distinct subPaths. - Fix issues with postmortems from different containers overwriting one another - Add addPostRunContainer to generate postrun container, to monitor running containers. script changes: - add container_watch.sh for postrun pod. - move all options handling into check_executes.sh code changes: - code grace time into k8s job file, so that postjob clearup can also use it - Clear wuid file (prevents spurious error association) Signed-off-by: Jake Smith --- helm/hpcc/templates/_helpers.tpl | 245 +++++++++++++++++--------- helm/hpcc/templates/dafilesrv.yaml | 12 +- helm/hpcc/templates/dali.yaml | 21 ++- helm/hpcc/templates/dfuserver.yaml | 13 +- helm/hpcc/templates/eclagent.yaml | 26 +-- helm/hpcc/templates/eclccserver.yaml | 25 ++- helm/hpcc/templates/eclscheduler.yaml | 16 +- helm/hpcc/templates/esp.yaml | 16 +- helm/hpcc/templates/localroxie.yaml | 17 +- helm/hpcc/templates/roxie.yaml | 41 ++--- helm/hpcc/templates/sasha.yaml | 6 +- helm/hpcc/templates/thor.yaml | 68 +++---- helm/hpcc/values.schema.json | 37 +++- initfiles/CMakeLists.txt | 4 +- initfiles/bin/check_executes | 129 -------------- initfiles/bin/check_executes.sh | 162 +++++++++++++++++ initfiles/bin/collect_postmortem.sh | 129 ++++++++++++++ initfiles/bin/container_watch.sh | 171 ++++++++++++++++++ initfiles/bin/k8s_postjob_clearup.sh | 23 ++- system/jlib/jcontainerized.cpp | 7 +- system/jlib/jcontainerized.hpp | 1 + system/jlib/jptree.cpp | 2 +- thorlcr/master/thgraphmanager.cpp | 1 + thorlcr/slave/slavmain.cpp | 2 + 24 files changed, 813 
insertions(+), 361 deletions(-) delete mode 100755 initfiles/bin/check_executes create mode 100755 initfiles/bin/check_executes.sh create mode 100755 initfiles/bin/collect_postmortem.sh create mode 100755 initfiles/bin/container_watch.sh diff --git a/helm/hpcc/templates/_helpers.tpl b/helm/hpcc/templates/_helpers.tpl index b5202d85598..e383472a70f 100644 --- a/helm/hpcc/templates/_helpers.tpl +++ b/helm/hpcc/templates/_helpers.tpl @@ -313,33 +313,85 @@ metrics: {{- end -}} {{/* -Add ConfigMap volume mount for a component +Add tmp volume mount */}} -{{- define "hpcc.addConfigMapVolumeMount" -}} -- name: {{ .name }}-temp-volume +{{- define "hpcc.addTempVolumeMount" -}} +{{- $volumeName := .volumeName | default .name -}} +- name: {{ $volumeName }}-temp-volume mountPath: /tmp -- name: {{ .name }}-hpcctmp-volume +{{- if not .noSubPath }} + subPath: {{ .name | quote }} +{{- end -}} +{{- end -}} + +{{/* +Add runtime volume mount +*/}} +{{- define "hpcc.addRuntimeVolumeMount" -}} +{{- $volumeName := .volumeName | default .name -}} +- name: {{ $volumeName }}-hpcctmp-volume mountPath: /var/lib/HPCCSystems -{{- if .tmpSubPath }} - subPath: {{ .tmpSubPath | quote }} -{{- end }} +{{- if not .noSubPath }} + subPath: {{ .name | quote }} +{{- end -}} +{{- end -}} + +{{/* +Add ConfigMap volume mount for a component +*/}} +{{- define "hpcc.addConfigMapVolumeMount" -}} - name: {{ .name }}-configmap-volume +{{- if .noSubPath }} + mountPath: {{ printf "/etc/config/%s" .name }} +{{- else }} mountPath: /etc/config {{- end -}} +{{- end -}} {{/* -Add ConfigMap volume for a component +Add standard ephemeral volume mounts for a component */}} -{{- define "hpcc.addConfigMapVolume" -}} +{{- define "hpcc.addEphemeralVolumeMounts" -}} +{{ include "hpcc.addTempVolumeMount" . }} +{{ include "hpcc.addRuntimeVolumeMount" . }} +{{ include "hpcc.addConfigMapVolumeMount" . 
}} +{{- end -}} + +{{/* +Add tmp volume for a component +*/}} +{{- define "hpcc.addTempVolume" -}} - name: {{ .name }}-temp-volume emptyDir: {} +{{- end -}} + +{{/* +Add runtime volume for a component +*/}} +{{- define "hpcc.addRuntimeVolume" -}} - name: {{ .name }}-hpcctmp-volume emptyDir: {} +{{- end -}} + +{{/* +Add ConfigMap volume for a component +*/}} +{{- define "hpcc.addConfigMapVolume" -}} - name: {{ .name }}-configmap-volume configMap: name: {{ .name }}-configmap {{- end -}} + +{{/* +Add standard ephemeral volumes (tmp, runtime and ConfigMap) for a component +*/}} +{{- define "hpcc.addEphemeralVolumes" -}} +{{ include "hpcc.addTempVolume" . }} +{{ include "hpcc.addRuntimeVolume" . }} +{{ include "hpcc.addConfigMapVolume" . }} +{{- end -}} + {{/* Get mount details Pass in plane @@ -734,57 +786,24 @@ Check that the storage and spill planes for a component exist {{- end -}} {{/* -Add command for a component -*/}} -{{- define "hpcc.componentCommand" -}} -{{- if .me.valgrind -}} -valgrind -{{- else if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}} -check_executes -{{- else -}} -{{ .process }} -{{- end }} -{{- end -}} - -{{/* -Add extra args for a component +Get config file path for a component */}} -{{- define "hpcc.componentStartArgs" -}} -{{- if .me.valgrind -}} -"--leak-check=full", -"--show-leak-kinds=all", -"--track-origins=yes", -"--num-callers=8", -"--log-fd=1", -{{ .process | quote }}, -{{- else if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}} - {{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" .root "category" "debug")) -}} - {{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}} - {{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}} - {{- $meExpert := .me.expert | default dict -}} - {{- $globalExpert := .root.Values.global.expert | default dict -}} - {{- $alwaysPostMortem := (hasKey $meExpert "alwaysPostMortem") | 
ternary $meExpert.alwaysPostMortem ($globalExpert.alwaysPostMortem | default false) -}} - {{- if $alwaysPostMortem -}} -"-a",{{ "\n" }} - {{- end -}} -"-d", {{ $prefix }}, -"--", -{{ .process | quote }}, -{{- end }} +{{- define "hpcc.getConfigArg" -}} +/etc/config/{{ .name }}.yaml {{- end -}} {{/* Add config arg for a component */}} {{- define "hpcc.configArg" -}} -"--config=/etc/config/{{ .name }}.yaml" +"--config={{ include "hpcc.getConfigArg" . }}" {{- end -}} {{/* -Add dali arg for a component +Get dali endpoint for a component Pass in dict with root, component (in case of error), optional (true if daliArg is optional) */}} -{{- define "hpcc.daliArg" -}} +{{- define "hpcc.getDali" -}} {{- if empty .root.Values.dali -}} {{- if not .optional -}} {{- $_ := fail (printf "%s requires a DALI to be defined" .component) -}} @@ -794,10 +813,22 @@ Pass in dict with root, component (in case of error), optional (true if daliArg {{- $daliService := $dali.service | default dict -}} {{- $daliHost := .overrideDaliHost | default $dali.name -}} {{- $daliServicePort := .overrideDaliPort | default ($daliService.servicePort | default 7070) -}} -"--daliServers={{ $daliHost }}:{{ $daliServicePort }}" +{{ $daliHost }}:{{ $daliServicePort }} {{- end -}} {{- end -}} + +{{/* +Add dali arg for a component +Pass in dict with root, component (in case of error), optional (true if daliArg is optional) +*/}} +{{- define "hpcc.daliArg" -}} +{{- $dali := include "hpcc.getDali" . -}} +{{- if $dali -}} +"--daliServers={{ $dali }}" +{{- end -}} +{{- end -}} + {{/* Get image name */}} @@ -1022,6 +1053,53 @@ NB: uid=10000 and gid=10001 are the uid/gid of the hpcc user, built into platfor {{- include "hpcc.configContainer" . 
| nindent 0 -}} {{- end -}} +{{/* +A sidecar container to run commands after a main container finishes +Pass in dict with root, me, lifeCycleCtx and optionally isJob +*/}} +{{- define "hpcc.addPostRunContainer" -}} +{{- $meExpert := .me.expert | default dict -}} +{{- $globalExpert := .root.Values.global.expert | default dict -}} +{{- $postRun := (hasKey $meExpert "postRunSidecar") | ternary $meExpert.postRunSidecar ((hasKey $globalExpert "postRunSidecar") | ternary $globalExpert.postRunSidecar true) }} +{{- if $postRun }} + {{- if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}} + {{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" .root "category" "debug")) -}} + {{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}} + {{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}} + {{- $dali := include "hpcc.getDali" . -}} +- name: postrun + {{- include "hpcc.addImageAttrs" . | nindent 2 }} + command: + - container_watch.sh + - {{ printf "--directory=%s" $prefix }} + {{- if $dali }} + - {{ printf "--daliServer=%s" $dali }} + {{- end }} + {{- if .isJob }} + - --isJob + {{- end }} + {{- range $container := .lifeCycleCtx.containers }} + - {{ $container.name }} + - {{ $container.process }} + {{- end }} + {{- include "hpcc.addSecurityContext" . 
| indent 2 }} + volumeMounts: + {{- include "hpcc.addTempVolumeMount" (.me | merge (dict "noSubPath" "true")) | nindent 2 }} + {{- include "hpcc.addRuntimeVolumeMount" (.me | merge (dict "noSubPath" "true")) | nindent 2 }} + {{- $uniqueConfigs := dict -}} + {{- range $container := .lifeCycleCtx.containers -}} + {{- $config := $container.config -}} + {{- $_ := set $uniqueConfigs $config true -}} + {{- end -}} + {{- $me := .me -}} + {{- range $config, $_ := $uniqueConfigs }} + {{- include "hpcc.addConfigMapVolumeMount" ($me | merge (dict "name" $config "noSubPath" "true")) | nindent 2 -}} + {{- end -}} + {{- include "hpcc.addVolumeMounts" (dict "root" .root "me" $me "includeCategories" (list "debug")) | nindent 2 }} + {{- end -}} +{{- end -}} +{{- end -}} + {{/* Container to watch for a file on a shared mount and execute a command Pass in dict with me and command @@ -1095,7 +1173,7 @@ Pass in a dictionary with root and me defined {{- define "hpcc.addSecurityContext" }} {{- $user := (.root.Values.global.user | default dict) }} securityContext: -{{- if .root.Values.global.privileged }} +{{- if (or .root.Values.global.privileged .privileged) }} privileged: true capabilities: add: @@ -1428,7 +1506,7 @@ data: {{/* A template to generate Sasha service containers -Pass in dict with root, me and dali if container in dali pod +Pass in dict with root, me, lifeCycleCtx and dali if container in dali pod */}} {{- define "hpcc.addSashaContainer" }} {{- $serviceName := printf "sasha-%s" .me.name }} @@ -1437,14 +1515,7 @@ Pass in dict with root, me and dali if container in dali pod {{- $env := concat (.root.Values.global.env | default list) (.env | default list) }} - name: {{ $serviceName | quote }} workingDir: /var/lib/HPCCSystems - command: [ saserver ] - args: [ -{{- with (dict "name" $serviceName) }} - {{ include "hpcc.configArg" . 
}}, -{{- end }} - "--service={{ .me.name }}", -{{ include "hpcc.daliArg" (dict "root" .root "component" "Sasha" "optional" false "overrideDaliHost" $overrideDaliHost "overrideDaliPort" $overrideDaliPort) | indent 10 }} - ] +{{- include "hpcc.addCommandAndLifecycle" (merge (pick . "root" "lifeCycleCtx") (dict "me" (.me | merge (dict "name" $serviceName))) (dict "process" "saserver" "extraArgs" (list (printf "--service=%s" .me.name)) "component" "Sasha" "optional" false "overrideConfigName" $serviceName "overrideDaliHost" $overrideDaliHost "overrideDaliPort" $overrideDaliPort)) | nindent 2 }} {{- include "hpcc.addResources" (dict "me" .me.resources "root" .root) | indent 2 }} {{- include "hpcc.addSecurityContext" . | indent 2 }} env: @@ -1822,11 +1893,11 @@ Pass in dict with root, pod, target and type {{/* Generate lifecycle, command and args -Pass in root, me and command +Pass in root, me, process and lifeCycleCtx (optionally isJob, containerName, overrideConfigName and extraArgs) */}} {{- define "hpcc.addCommandAndLifecycle" -}} -{{- $misc := .root.Values.global.misc | default dict }} -{{- $postJobCommand := $misc.postJobCommand | default "" }} +{{- $misc := .root.Values.global.misc | default dict -}} +{{- $postJobCommand := (.isJob | default false) | ternary $misc.postJobCommand "" -}} lifecycle: preStop: exec: @@ -1835,38 +1906,42 @@ lifecycle: - "-c" - >- k8s_postjob_clearup.sh -{{- if and (not $misc.postJobCommandViaSidecar) $postJobCommand }} ; +{{- if $misc.postJobCommandViaSidecar }} ; + touch /wait-and-run/{{ .me.name }}.jobdone +{{- else if $postJobCommand }} ; {{ $postJobCommand }} -{{- end }} -command: ["/bin/bash"] -args: -- -c -{{- $check_cmd := dict "command" .command}} -{{- if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}} +{{- end -}} +{{- $meExpert := .me.expert | default dict -}} +{{- $globalExpert := .root.Values.global.expert | default dict -}} +{{- $containerName := .containerName | default .me.name -}} +{{- $args := list -}} +{{- $configCtx := (hasKey . 
"overrideConfigName") | ternary (dict "name" .overrideConfigName) .me -}} +{{- if .me.valgrind -}} + {{- $args = append $args "-v" -}} +{{- else if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}} {{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" .root "category" "debug")) -}} {{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}} {{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}} - {{- $pmd_always_opt := "" -}} - {{- $globalExpert := .root.Values.global.expert | default dict -}} - {{- $meExpert := .me.expert | default dict -}} {{- $alwaysPostMortem := (hasKey $meExpert "alwaysPostMortem") | ternary $meExpert.alwaysPostMortem ($globalExpert.alwaysPostMortem | default false) -}} {{- if $alwaysPostMortem -}} - {{- $pmd_always_opt = "-a " -}} + {{- $args = append $args "-a" -}} {{- end -}} - {{- $_ := set $check_cmd "command" (printf "check_executes %s-d %s -- %s" $pmd_always_opt $prefix .command) -}} -{{- end }} -- >- - {{ $check_cmd.command }}; - exitCode=$?; - k8s_postjob_clearup.sh; -{{- if $misc.postJobCommandViaSidecar -}} - touch /wait-and-run/{{ .me.name }}.jobdone; -{{- else if $postJobCommand -}} - {{ $postJobCommand }} ; + {{- $postRun := (hasKey $meExpert "postRunSidecar") | ternary $meExpert.postRunSidecar ((hasKey $globalExpert "postRunSidecar") | ternary $globalExpert.postRunSidecar true) -}} + {{- if $postRun -}} + {{- $args = append $args "-p" -}} + {{- end -}} + {{- $args = concat $args (list "-d" $prefix "-c" $containerName "--") -}} + {{- $_ := set .lifeCycleCtx "containers" (append .lifeCycleCtx.containers (dict "name" $containerName "process" .process "config" $configCtx.name)) -}} +{{- end -}} +{{- $args = append $args .process -}} +{{- $args = append $args (include "hpcc.configArg" $configCtx) -}} +{{- $args = append $args (include "hpcc.daliArg" .) -}} +{{- if hasKey . 
"extraArgs" -}} + {{- $args = concat $args .extraArgs -}} {{- end }} - exit $exitCode; +command: ["check_executes.sh"] +args: [ {{ join " " $args }} ] {{- end -}} - {{- define "hpcc.addCertificateImpl" }} {{- if (.root.Values.certificates | default dict).enabled -}} {{- $externalCert := .externalCert -}} diff --git a/helm/hpcc/templates/dafilesrv.yaml b/helm/hpcc/templates/dafilesrv.yaml index b6d2ee55d89..5ca02f37169 100644 --- a/helm/hpcc/templates/dafilesrv.yaml +++ b/helm/hpcc/templates/dafilesrv.yaml @@ -22,6 +22,7 @@ data: {{- if not .disabled -}} {{- $env := concat ($.Values.global.env | default list) (.env | default list) -}} {{- $commonCtx := dict "root" $ "me" . "env" $env "exposure" "local" "visibility" .service.visibility "includeCategories" (list "data" "debug") -}} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- if (eq "spray" .application) -}} {{- $_ := set $commonCtx "includeCategories" (concat $commonCtx.includeCategories (list "lz" "remote")) -}} {{- end -}} @@ -58,16 +59,14 @@ spec: spec: {{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" .name "type" "dafilesrv") | indent 6 }} serviceAccountName: "hpcc-default" + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" $commonCtx | indent 6 }} {{- include "hpcc.addImagePullSecrets" $commonCtx | nindent 6 -}} containers: - name: {{ .name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "dafilesrv") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "dafilesrv") | nindent 16 }} - {{ include "hpcc.configArg" . 
}} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "dafilesrv" "component" "DaFileSrv" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 2) | indent 8 -}} - name: "SENTINEL" @@ -77,7 +76,7 @@ spec: {{- include "hpcc.addResources" (dict "me" .resources "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{- if $commonCtx.certificatesEnabled }} @@ -87,8 +86,9 @@ spec: {{- $_ := fail (printf "dafilesrv[application=stream]- certificates must be enabled to use") -}} {{- end }} {{- end }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . 
| indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} {{- if $commonCtx.certificatesEnabled }} diff --git a/helm/hpcc/templates/dali.yaml b/helm/hpcc/templates/dali.yaml index b2b6920ef99..11ce76691e8 100644 --- a/helm/hpcc/templates/dali.yaml +++ b/helm/hpcc/templates/dali.yaml @@ -48,6 +48,7 @@ true {{- $env := concat ($.Values.global.env | default list) (.env | default list) -}} {{- $daliPlaneIncludeCategories := list "dali" "debug" -}} {{- $commonCtx := dict "root" $ "me" $dali "includeCategories" $daliPlaneIncludeCategories "env" $env -}} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- $daliSecretsCategories := list "system" "authn" -}} {{- $tmpDaliScope := dict "aggregateSashaSecretsCategories" list "aggregatePlaneCategories" $daliPlaneIncludeCategories "aggregateSashaNamedPlanes" list -}} {{- $daliSashaServicesCtx := dict "services" ($dali.services | default dict) -}} @@ -118,10 +119,7 @@ spec: containers: - name: {{ $dali.name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" $dali "root" $ "process" "daserver") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" $dali "root" $ "process" "daserver") | nindent 16 }} - {{ include "hpcc.configArg" $dali }} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "daserver" "component" "Dali Server" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 4) | indent 8 -}} - name: "SENTINEL" @@ -131,11 +129,11 @@ spec: {{- include "hpcc.addResources" (dict "me" $dali.resources "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" $dali | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" $dali | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{- 
include "hpcc.addSecretVolumeMounts" (dict "root" $ "secretsCategories" $daliSecretsCategories) | nindent 8 }} -{{ include "hpcc.addVaultClientCertificateVolumeMounts" (dict "root" $ "secretsCategories" $daliSecretsCategories) | indent 8 }} -{{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" .name "component" "dali" "external" false) | nindent 8 }} +{{- include "hpcc.addVaultClientCertificateVolumeMounts" (dict "root" $ "secretsCategories" $daliSecretsCategories) | nindent 8 }} +{{- include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" .name "component" "dali" "external" false) | nindent 8 }} {{- range $sashaName, $_sasha := $daliSashaServicesCtx.services -}} {{- $sasha := deepCopy ($_sasha | default dict) -}} {{- $_ := set $sasha "name" $sashaName -}} @@ -146,11 +144,11 @@ spec: {{- $_ := set $thisServiceCtx "sashaStoragePlane" ($sasha.plane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" $ "category" "sasha"))) -}} {{- end -}} {{- with ($sasha | merge (dict "access" $sashaAccess)) -}} - {{- include "hpcc.addSashaContainer" (dict "root" $ "me" . "dali" $dali "overrideDaliHost" "localhost" "overrideDaliPort" $daliPort) | nindent 6 }} + {{- include "hpcc.addSashaContainer" (dict "root" $ "me" . "lifeCycleCtx" $commonCtx.lifeCycleCtx "dali" $dali "overrideDaliHost" "localhost" "overrideDaliPort" $daliPort) | nindent 6 }} volumeMounts: {{- $serviceName := printf "sasha-%s" $sashaName -}} {{- with (dict "name" $serviceName) }} - {{- include "hpcc.addConfigMapVolumeMount" . | nindent 8 }} + {{- include "hpcc.addEphemeralVolumeMounts" (dict "name" $serviceName "volumeName" $dali.name) | nindent 8 }} {{ end -}} {{- $sashaPlaneCategories := splitList " " (include "hpcc.getSashaPlanesFromAccess" .) 
-}} {{- include "hpcc.addVolumeMounts" (dict "root" $ "includeCategories" $sashaPlaneCategories "includeNames" (list $thisServiceCtx.sashaStoragePlane)) | indent 8 }} @@ -159,13 +157,14 @@ spec: {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" $dali.name "component" "dali" "external" false) | nindent 8 }} {{- end }} {{- end }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{- include "hpcc.addConfigMapVolume" $dali | nindent 6 -}} +{{- include "hpcc.addEphemeralVolumes" $dali | nindent 6 -}} {{- range $sashaName, $_sasha := $daliSashaServicesCtx.services -}} {{- $sasha := ($_sasha | default dict) -}} {{- $serviceName := printf "sasha-%s" $sashaName -}} {{- with (dict "name" $serviceName) -}} - {{- include "hpcc.addConfigMapVolume" . | nindent 6 -}} + {{- include "hpcc.addEphemeralVolumes" (dict "name" $serviceName) | nindent 6 -}} {{- end -}} {{- end -}} {{- include "hpcc.addVolumes" (dict "root" $ "me" $dali "includeNames" $tmpDaliScope.aggregateSashaNamedPlanes "includeCategories" $tmpDaliScope.aggregatePlaneCategories) | indent 6 }} diff --git a/helm/hpcc/templates/dfuserver.yaml b/helm/hpcc/templates/dfuserver.yaml index 8738622f967..618bb6199a6 100644 --- a/helm/hpcc/templates/dfuserver.yaml +++ b/helm/hpcc/templates/dfuserver.yaml @@ -41,6 +41,7 @@ data: {{- $env := concat ($.Values.global.env | default list) (.env | default list) -}} {{- $secretsCategories := list "system" "storage" -}} {{- $commonCtx := dict "root" $ "me" . 
"secretsCategories" $secretsCategories "includeCategories" (list "lz" "remote" "data" "debug") "env" $env }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.dfuServerConfigMap" "component" "dfuserver" "excludeKeys" "global")) -}} apiVersion: apps/v1 kind: Deployment @@ -67,17 +68,14 @@ spec: spec: {{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" .name "target" .name "type" "dfuserver") | indent 6 }} serviceAccountName: "hpcc-default" + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" $commonCtx | indent 6 }} {{- include "hpcc.addImagePullSecrets" $commonCtx | nindent 6 -}} containers: - name: {{ .name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "dfuserver") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "dfuserver") | nindent 16 }} - {{ include "hpcc.configArg" . }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "DFU Server" "optional" false) }} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "dfuserver" "component" "DFU Server" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | indent 8 -}} - name: "SENTINEL" @@ -88,12 +86,13 @@ spec: {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: {{- include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . 
| indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" .name "component" "dfuserver" "external" false) | indent 8 }} {{- include "hpcc.addSecretVolumeMounts" $commonCtx | nindent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . | indent 6 }} {{- include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addCertificateVolume" (dict "root" $ "name" .name "component" "dfuserver" "external" false) | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} diff --git a/helm/hpcc/templates/eclagent.yaml b/helm/hpcc/templates/eclagent.yaml index 29fe33e4422..09437f1d491 100644 --- a/helm/hpcc/templates/eclagent.yaml +++ b/helm/hpcc/templates/eclagent.yaml @@ -74,6 +74,7 @@ data: {{- if $misc.postJobCommandViaSidecar }} shareProcessNamespace: true {{- end }} + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" . | indent 10 }} {{- include "hpcc.addImagePullSecrets" . | nindent 10 -}} @@ -85,14 +86,13 @@ data: {{- include "hpcc.addSecurityContext" . | indent 12 }} {{ include "hpcc.addImageAttrs" . | indent 12 }} {{- include "hpcc.addResources" (dict "me" .me.resources "root" .root) | indent 12 }} -{{- $appCmd := printf "%s %s %s _HPCC_ARGS_" $apptype (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "ECL Agent" "optional" false )) }} -{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $appCmd)) | indent 12 }} +{{- include "hpcc.addCommandAndLifecycle" (. 
| merge (dict "isJob" true "process" $apptype "extraArgs" (list "_HPCC_ARGS_") "component" "ECL Agent" "optional" false)) | nindent 12 }} env: {{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }} {{- include "hpcc.generateImageEnv" . | nindent 12 }} workingDir: /var/lib/HPCCSystems volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" .me | indent 12 }} +{{ include "hpcc.addEphemeralVolumeMounts" .me | indent 12 }} {{ include "hpcc.addVolumeMounts" . | indent 12 }} {{ include "hpcc.addSecretVolumeMounts" . | indent 12 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" . | indent 12 }} @@ -100,8 +100,9 @@ data: {{- if $misc.postJobCommandViaSidecar }} {{ include "hpcc.addWaitAndRunVolumeMount" . | indent 12 }} {{- end }} +{{- include "hpcc.addPostRunContainer" (. | merge (dict "isJob" true)) | nindent 10 }} volumes: -{{ include "hpcc.addConfigMapVolume" .me | indent 10 }} +{{ include "hpcc.addEphemeralVolumes" .me | indent 10 }} {{ include "hpcc.addVolumes" . | indent 10 }} {{ include "hpcc.addSecretVolumes" . | indent 10 }} {{ include "hpcc.addVaultClientCertificateVolumes" . | indent 10 }} @@ -122,7 +123,8 @@ data: {{- $enginePlaneDetails := dict -}} {{- $_ := include "hpcc.getEnginePlanes" (dict "root" $ "me" . "result" $enginePlaneDetails) -}} {{- $commonCtx := dict "root" $ "me" . 
"secretsCategories" $secretsCategories "includeCategories" $enginePlaneDetails.planeCategories "includeNames" $enginePlaneDetails.namedPlanes "env" $env }} -{{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.agentConfigMap" "component" "eclagent" "excludeKeys" (print "global," $apptype ".replicas"))) }} +{{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.agentConfigMap" "component" "eclagent" "excludeKeys" (print "global," $apptype ".replicas") "lifeCycleCtx" (dict "containers" list))) }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- include "hpcc.checkDefaultStoragePlane" $commonCtx }} apiVersion: apps/v1 kind: Deployment @@ -150,17 +152,14 @@ spec: spec: {{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" .name "target" .name "type" "eclagent") | indent 6 }} serviceAccountName: "hpcc-agent" + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" $commonCtx | indent 6 }} {{- include "hpcc.addImagePullSecrets" $commonCtx | nindent 6 -}} containers: - name: {{ .name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "agentexec") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "agentexec") | nindent 16 }} - {{ include "hpcc.configArg" . }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "ECL Agent" "optional" false) }} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "agentexec" "component" "ECL Agent" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | indent 8 -}} - name: "SENTINEL" @@ -174,13 +173,14 @@ spec: {{- end }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . 
| indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" .name "component" $apptype "includeRemote" true) | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . | indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} @@ -191,7 +191,7 @@ spec: {{- end }} --- kind: ConfigMap -{{ include "hpcc.generateConfig" ($commonCtx | merge (dict "configMapHelper" "hpcc.agentConfigMap")) }} +{{ include "hpcc.generateConfig" ((omit $commonCtx "lifeCycleCtx") | merge (dict "configMapHelper" "hpcc.agentConfigMap" "lifeCycleCtx" (dict "containers" list))) }} --- {{ include "hpcc.addCertificate" (dict "root" $ "name" .name "component" $apptype "includeRemote" true) }} --- diff --git a/helm/hpcc/templates/eclccserver.yaml b/helm/hpcc/templates/eclccserver.yaml index 3aba72c5686..9d8f388b53b 100644 --- a/helm/hpcc/templates/eclccserver.yaml +++ b/helm/hpcc/templates/eclccserver.yaml @@ -85,14 +85,14 @@ data: {{ include "hpcc.addImageAttrs" . | indent 12 }} {{- $misc := .root.Values.global.misc | default dict -}} {{- $postJobCommand := $misc.postJobCommand | default "" }} -{{- $eclccserverCmd := printf "eclccserver %s %s _HPCC_ARGS_" (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "ECLCC Server" "optional" false)) }} -{{ include "hpcc.addCommandAndLifecycle" (. 
| merge (dict "command" $eclccserverCmd)) | indent 12 }} +{{- $extraArgs := list "_HPCC_ARGS_" -}} +{{- include "hpcc.addCommandAndLifecycle" (. | merge (dict "isJob" true "process" "eclccserver" "extraArgs" $extraArgs "component" "ECLCC Server" "optional" false)) | nindent 12 }} env: {{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }} {{- include "hpcc.generateImageEnv" . | nindent 12 }} workingDir: /tmp volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" .me | indent 12 }} +{{ include "hpcc.addEphemeralVolumeMounts" .me | indent 12 }} {{ include "hpcc.addVolumeMounts" . | indent 12 }} {{ include "hpcc.addSecretVolumeMounts" . | indent 12 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" . | indent 12 }} @@ -100,8 +100,9 @@ data: {{- if $misc.postJobCommandViaSidecar }} {{ include "hpcc.addWaitAndRunVolumeMount" . | indent 12 }} {{- end }} +{{- include "hpcc.addPostRunContainer" (. | merge (dict "isJob" true)) | nindent 10 }} volumes: -{{ include "hpcc.addConfigMapVolume" .me | indent 10 }} +{{ include "hpcc.addEphemeralVolumes" .me | indent 10 }} {{ include "hpcc.addVolumes" . | indent 10 }} {{ include "hpcc.addSecretVolumes" . | indent 10 }} {{ include "hpcc.addVaultClientCertificateVolumes" . | indent 10 }} @@ -131,7 +132,8 @@ data: {{- $env := concat ($.Values.global.env | default list) (.env | default list) $gitenv -}} {{- $secretsCategories := list "system" "codeVerify" "git" "storage" "jfrog" }} {{- $commonCtx := dict "root" $ "me" . 
"includeCategories" (list "dll" "git" "debug") "secretsCategories" $secretsCategories "env" $env }} -{{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.eclccServerConfigMap" "component" "eclccserver" "excludeKeys" "global,eclccserver.queues")) }} +{{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.eclccServerConfigMap" "component" "eclccserver" "excludeKeys" "global,eclccserver.queues" "lifeCycleCtx" (dict "containers" list))) }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -166,11 +168,7 @@ spec: containers: - name: {{ .name | quote }} workingDir: /tmp - command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "eclccserver") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "eclccserver") | nindent 16 }} - {{ include "hpcc.configArg" . }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "ECLCC Server" "optional" false) }} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "eclccserver" "component" "ECLCC Server" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | indent 8 -}} - name: "SENTINEL" @@ -185,15 +183,16 @@ spec: {{- end }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . 
| indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 -}} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" .name "component" "eclccserver" "includeRemote" true) | indent 8 }} - name: "hpccbundles" mountPath: "/home/hpcc/.HPCCSystems" +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . | indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} @@ -206,7 +205,7 @@ spec: {{- end }} --- kind: ConfigMap -{{ include "hpcc.generateConfig" ($commonCtx | merge (dict "configMapHelper" "hpcc.eclccServerConfigMap")) }} +{{ include "hpcc.generateConfig" ((omit $commonCtx "lifeCycleCtx") | merge (dict "configMapHelper" "hpcc.eclccServerConfigMap" "lifeCycleCtx" (dict "containers" list))) }} --- {{ include "hpcc.addCertificate" (dict "root" $ "name" .name "component" "eclccserver" "includeRemote" true) }} {{ include "hpcc.addCertificate" (dict "root" $ "name" .name "component" "compile" "includeRemote" true) }} diff --git a/helm/hpcc/templates/eclscheduler.yaml b/helm/hpcc/templates/eclscheduler.yaml index 67f1321930f..c00d0b339b4 100644 --- a/helm/hpcc/templates/eclscheduler.yaml +++ b/helm/hpcc/templates/eclscheduler.yaml @@ -47,7 +47,8 @@ data: {{- if not .disabled -}} {{- $env := concat ($.Values.global.env | default list) (.env | default list) -}} {{- $secretsCategories := list "system" }} -{{- $commonCtx := dict "root" $ "me" . "secretsCategories" $secretsCategories "env" $env }} +{{- $commonCtx := dict "root" $ "me" . 
"includeCategories" (list "debug") "secretsCategories" $secretsCategories "env" $env }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.eclSchedulerConfigMap" "component" "eclscheduler" "excludeKeys" "global")) -}} apiVersion: apps/v1 kind: Deployment @@ -82,11 +83,7 @@ spec: containers: - name: {{ .name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "eclscheduler") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "eclscheduler") | nindent 16 }} - {{ include "hpcc.configArg" . }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "ECL Scheduler" "optional" false) }} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "eclscheduler" "component" "ECL Scheduler" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | indent 8 -}} - name: "SENTINEL" @@ -97,12 +94,15 @@ spec: {{- include "hpcc.addResources" (dict "me" .resources "defaults" $defaultResources "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} +{{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 -}} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" .name "component" "eclscheduler") | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . 
| indent 6 }} +{{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addCertificateVolume" (dict "root" $ "name" .name "component" "eclscheduler") | indent 6 }} diff --git a/helm/hpcc/templates/esp.yaml b/helm/hpcc/templates/esp.yaml index bf62463087e..f949082cc4b 100644 --- a/helm/hpcc/templates/esp.yaml +++ b/helm/hpcc/templates/esp.yaml @@ -96,6 +96,7 @@ data: {{- $secretsCategories := ternary (append $noAuthSecretsCategories "authn") $noAuthSecretsCategories (eq $authtype "ldap") -}} {{- $includeStorageCategories := ternary (list "lz" "data" "dll" "debug") (list "data" "dll" "debug") (eq $application "eclwatch") -}} {{- $commonCtx := dict "root" $ "me" . "secretsCategories" $secretsCategories "includeCategories" $includeStorageCategories "env" $env -}} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.espConfigMap" "component" "esp" "excludeKeys" "global,esp.queues")) -}} {{- if (ne (include "hpcc.isVisibilityPublic" (dict "root" $ "visibility" .service.visibility)) "") }} {{- /* If esp is using a public cert, this flag is used later to add the local MTLS mount and cert */ -}} @@ -140,18 +141,16 @@ spec: {{- include "hpcc.addPrometheusScrapeAnnotations" $.Values.global.metrics | nindent 8 }} {{- end }} spec: + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} {{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" .name "type" "esp") | indent 6 }} serviceAccountName: "hpcc-esp-service" {{- include "hpcc.addImagePullSecrets" $commonCtx | nindent 6 -}} containers: - name: {{ .name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . 
"root" $ "process" "esp") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "esp") | nindent 16 }} - {{ printf "--application=%s" $application | quote }}, - {{ include "hpcc.configArg" . }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" $application "optional" (has $application (list "esdl" "ldapenvironment" "loggingservice")) )}} - ] +{{- $optional := (has $application (list "esdl" "ldapenvironment" "loggingservice")) -}} +{{- $extraArgs := list (printf "--application=%s" $application) -}} +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "esp" "extraArgs" $extraArgs "component" $application "optional" $optional)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 4) | indent 8 -}} - name: "SENTINEL" @@ -161,7 +160,7 @@ spec: {{- include "hpcc.addResources" (dict "me" .resources "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} @@ -172,8 +171,9 @@ spec: {{- if $signingCertGenerator }} {{- include "hpcc.addCertificateVolumeMount" (dict "root" $ "component" $signingCertGenerator "name" "global" "issuerKeyName" "signing") | nindent 8 }} {{- end }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . 
| indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} diff --git a/helm/hpcc/templates/localroxie.yaml b/helm/hpcc/templates/localroxie.yaml index 99a1b11250c..9c860cdecea 100644 --- a/helm/hpcc/templates/localroxie.yaml +++ b/helm/hpcc/templates/localroxie.yaml @@ -48,6 +48,7 @@ data: {{- $enginePlaneDetails := dict -}} {{- $_ := include "hpcc.getEnginePlanes" (dict "root" $ "me" . "result" $enginePlaneDetails) -}} {{- $commonCtx := dict "root" $ "me" $roxie "includeCategories" $enginePlaneDetails.planeCategories "includeNames" $enginePlaneDetails.namedPlanes "secretsCategories" $secretsCategories "env" $env }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.localroxieConfigMap" "component" "roxie" "excludeKeys" "global")) }} {{- include "hpcc.checkDefaultStoragePlane" $commonCtx }} {{- $singleNode := (hasKey $roxie "singleNode") | ternary $roxie.singleNode ((hasKey $roxie "localAgent") | ternary $roxie.localAgent false) }} @@ -85,20 +86,15 @@ spec: spec: {{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" $roxie.name "target" $roxie.name "type" "roxie") | indent 6 }} serviceAccountName: "hpcc-default" + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" $commonCtx | indent 6 }} {{- include "hpcc.addImagePullSecrets" $commonCtx | nindent 6 -}} containers: - name: {{ $roxie.name | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" $roxie "root" $ "process" "roxie") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" $roxie "root" $ "process" "roxie") | nindent 16 }} - {{ include "hpcc.configArg" $roxie }}, - {{ include "hpcc.daliArg" 
(dict "root" $ "component" "Local Roxie" "optional" false) }}, - "--server=true", - "--localAgent={{ $localAgent }}", - "--resolveLocally=false" - ] +{{- $extraArgs := list "--server=true" (printf "--localAgent=%v" $localAgent) "--resolveLocally=false" -}} +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "roxie" "extraArgs" $extraArgs "component" "Local Roxie" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 8) | indent 8 -}} - name: "SENTINEL" @@ -118,15 +114,16 @@ spec: {{- include "hpcc.addResources" (dict "me" $roxie.resources "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{- include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" $roxie.name "component" "localroxie" "external" false) | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" $roxie.name "component" "localroxie" "external" true "includeRemote" true) | indent 8 }} {{ include "hpcc.addUDPCertificateVolumeMount" (dict "root" $ "name" $roxie.name "component" "localudpkey" ) | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . 
| indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} diff --git a/helm/hpcc/templates/roxie.yaml b/helm/hpcc/templates/roxie.yaml index 984587b0c68..4f858f6f599 100644 --- a/helm/hpcc/templates/roxie.yaml +++ b/helm/hpcc/templates/roxie.yaml @@ -110,6 +110,8 @@ data: {{- end }} {{- end }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} +{{- $topServerMe := . | merge (dict "name" $commonCtx.toponame) -}} apiVersion: apps/v1 kind: Deployment metadata: @@ -151,19 +153,19 @@ spec: {{- include "hpcc.addResources" (dict "me" .topoResources "defaults" $defaultResources "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" $toposerver "root" $ "process" "toposerver") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" $toposerver "root" $ "process" "toposerver") | nindent 16 }} - {{ include "hpcc.configArg" $toposerver }} - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "toposerver" "containerName" $commonCtx.toponame "component" "Toposerver" "optional" false "overrideConfigName" $toposerver.name)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | indent 8 -}} - name: "SENTINEL" value: "/tmp/{{ $commonCtx.toponame }}.sentinel" volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" $toposerver | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" $toposerver | indent 8 }} +{{ include "hpcc.addVolumeMounts" ($commonCtx | merge (dict "includeCategories" (list "debug"))) | indent 8 -}} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "component" "topo" "name" $commonCtx.toponame "external" false) | indent 8 }} +{{- include "hpcc.addPostRunContainer" ($commonCtx | merge (dict "me" 
$topServerMe)) | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" $toposerver | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" $toposerver | indent 6 }} +{{ include "hpcc.addVolumes" ($commonCtx | merge (dict "includeCategories" (list "debug"))) | indent 6 }} {{ include "hpcc.addCertificateVolume" (dict "root" $ "component" "topo" "name" $commonCtx.toponame "external" false) | indent 6 }} --- @@ -227,6 +229,7 @@ kind: ConfigMap --- {{- $_ := set $commonCtx "instanceNames" list -}} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{ if $roxie.serverReplicas -}} {{ $_ := set $commonCtx "instanceNames" (list $servername) }} apiVersion: apps/v1 @@ -274,11 +277,7 @@ spec: containers: - name: {{ $servername | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" $roxie "root" $ "process" "roxie") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" $roxie "root" $ "process" "roxie") | nindent 16 }} - {{ include "hpcc.daliArg" (dict "root" $ "component" "Roxie" "optional" false) }}, - "--server=true" - ] +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "roxie" "component" "Roxie" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 8) | indent 8 -}} - name: "SENTINEL" @@ -303,15 +302,16 @@ spec: {{- include "hpcc.addResources" (dict "me" ($roxie.serverResources | default $roxie.resources) "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" $roxie | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" $roxie | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "component" "roxie-server" 
"name" $servername "external" false) | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "component" "roxie-server" "name" $servername "certificate" $roxie.certificate "external" true "includeRemote" true) | indent 8 }} {{ include "hpcc.addUDPCertificateVolumeMount" (dict "root" $ "component" "udpkey" "name" $udpkeyname ) | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" $roxie | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" $roxie | indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} @@ -332,6 +332,7 @@ spec: {{- $channel := add $c 1 -}} {{- $name := printf "%s-agent-%d" $roxie.name $channel }} {{- $_ := set $commonCtx "instanceNames" (append $commonCtx.instanceNames $name) }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{ include "hpcc.addCertificate" (dict "root" $ "name" $name "services" $roxie.services "component" "roxie-agent" "external" false) }} --- @@ -379,13 +380,8 @@ spec: containers: - name: {{ $name | quote}} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" $roxie "root" $ "process" "roxie") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" $roxie "root" $ "process" "roxie") | nindent 16 }} - {{ include "hpcc.configArg" $roxie }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "Roxie" "optional" false) }}, - "--channels={{ $channel }}", - "--server={{ not $roxie.serverReplicas }}", - ] +{{- $extraArgs := list (printf "--channels=%d" $channel) (printf "--server=%t" (not $roxie.serverReplicas)) -}} +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "roxie" "extraArgs" $extraArgs "component" "Roxie" "optional" false)) | nindent 8 }} env: {{ include "hpcc.mergeEnvironments" (dict "env" $env 
"defaultArenas" 8) | indent 8 -}} - name: "SENTINEL" @@ -412,7 +408,7 @@ spec: {{- include "hpcc.addResources" (dict "me" ($roxie.channelResources | default $roxie.resources) "root" $) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" $roxie | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" $roxie | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} @@ -423,8 +419,9 @@ spec: {{ include "hpcc.addUDPCertificateVolumeMount" (dict "root" $ "component" "udpkey" "name" $udpkeyname ) | indent 8 }} {{- end }}{{/* not serverReplicas */}} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" $roxie | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" $roxie | indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} diff --git a/helm/hpcc/templates/sasha.yaml b/helm/hpcc/templates/sasha.yaml index bc40fd7cbf5..a0313beaef9 100644 --- a/helm/hpcc/templates/sasha.yaml +++ b/helm/hpcc/templates/sasha.yaml @@ -30,6 +30,7 @@ {{- $secretsCategories := append ((or (has "data" $sasha.access) (has "dalidata" $sasha.access)) | ternary (list "storage") list) "system" -}} {{- $commonCtx := dict "root" $ "me" $sasha "secretsCategories" $secretsCategories -}} {{- $_ := set $commonCtx "includeCategories" (splitList " " (include "hpcc.getSashaPlanesFromAccess" $sasha)) -}} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}} {{- if and (hasKey $sasha "plane") $sasha.plane -}} {{- $_ := set $commonCtx "includeNames" (list ($sasha.plane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" $ "category" "sasha")))) -}} 
{{- end -}} @@ -63,6 +64,7 @@ spec: spec: {{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" $serviceName "type" "sasha") | indent 6 }} serviceAccountName: "hpcc-default" + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} {{- if hasKey $sasha "plane" }} initContainers: {{- include "hpcc.createConfigInitContainers" $commonCtx | indent 6 }} @@ -72,7 +74,7 @@ spec: {{- include "hpcc.addSashaContainer" $commonCtx | indent 6 }} volumeMounts: {{- with (dict "name" $serviceName) }} - {{- include "hpcc.addConfigMapVolumeMount" . | nindent 8 }} + {{- include "hpcc.addEphemeralVolumeMounts" . | nindent 8 }} {{ end -}} {{- include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} @@ -80,7 +82,7 @@ spec: {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" $sashaName "component" "sasha" "external" false) | indent 8 }} volumes: {{- with (dict "name" $serviceName) }} - {{- include "hpcc.addConfigMapVolume" . | nindent 6 }} + {{- include "hpcc.addEphemeralVolumes" . | nindent 6 }} {{ end -}} {{- include "hpcc.addVolumes" $commonCtx | indent 6 }} {{- include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} diff --git a/helm/hpcc/templates/thor.yaml b/helm/hpcc/templates/thor.yaml index df95266fcbe..8edd3831d20 100644 --- a/helm/hpcc/templates/thor.yaml +++ b/helm/hpcc/templates/thor.yaml @@ -76,6 +76,7 @@ data: {{ include "hpcc.generateGlobalConfigMap" .root| indent 6 }} {{- if not .eclAgentUseChildProcesses }} + {{- $_ := set . 
"lifeCycleCtx" (dict "containers" list) }} {{ $eclAgentType }}-job.yaml: apiVersion: batch/v1 kind: Job @@ -98,6 +99,7 @@ data: spec: {{- include "hpcc.placementsByJobTargetType" (dict "root" .root "job" $eclAgentJobName "target" .me.name "type" "thor") | indent 10 }} serviceAccountName: "hpcc-agent" + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" . | indent 10 }} {{- include "hpcc.addImagePullSecrets" . | nindent 10 -}} @@ -112,14 +114,14 @@ data: {{- include "hpcc.addSecurityContext" . | indent 12 }} {{ include "hpcc.addImageAttrs" . | indent 12 }} {{- include "hpcc.addResources" (dict "me" .eclAgentResources "root" .root) | indent 12 }} -{{- $agentCmd := printf "%s %s %s _HPCC_ARGS_" $eclAgentType (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "Thor" "optional" false)) }} -{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $agentCmd)) | indent 12 }} +{{- $extraArgs := list "_HPCC_ARGS_" -}} +{{- include "hpcc.addCommandAndLifecycle" (. | merge (dict "isJob" true "process" $eclAgentType "extraArgs" $extraArgs "component" "Thor ECL Agent" "optional" false)) | nindent 12 }} env: {{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 1) | nindent 12 }} {{- include "hpcc.generateImageEnv" . | nindent 12 }} workingDir: /var/lib/HPCCSystems volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" .me | indent 12 }} +{{ include "hpcc.addEphemeralVolumeMounts" .me | indent 12 }} {{ include "hpcc.addVolumeMounts" . | indent 12 }} {{ include "hpcc.addSecretVolumeMounts" . | indent 12 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" . | indent 8 }} @@ -127,8 +129,9 @@ data: {{- if $misc.postJobCommandViaSidecar }} {{ include "hpcc.addWaitAndRunVolumeMount" . 
| indent 12 }} {{- end }} +{{- include "hpcc.addPostRunContainer" (. | merge (dict "isJob" true)) | nindent 10 }} volumes: -{{ include "hpcc.addConfigMapVolume" .me | indent 10 }} +{{ include "hpcc.addEphemeralVolumes" .me | indent 10 }} {{ include "hpcc.addVolumes" . | indent 10 }} {{ include "hpcc.addSecretVolumes" . | indent 10 }} {{ include "hpcc.addVaultClientCertificateVolumes" . | indent 10 }} @@ -140,6 +143,7 @@ data: backoffLimit: 0 {{- end }} +{{- $_ := set . "lifeCycleCtx" (dict "containers" list) }} thormanager-job.yaml: apiVersion: batch/v1 kind: Job @@ -166,6 +170,7 @@ data: spec: {{- include "hpcc.placementsByJobTargetType" (dict "root" .root "job" $thorManagerJobName "target" .me.name "type" "thor") | indent 10 }} serviceAccountName: hpcc-thoragent + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 60 }} initContainers: {{- include "hpcc.createConfigInitContainers" . | indent 10 }} {{- include "hpcc.addImagePullSecrets" . | nindent 10 -}} @@ -180,14 +185,14 @@ data: {{- include "hpcc.addSecurityContext" . | indent 12 }} {{ include "hpcc.addImageAttrs" . | indent 12 }} {{- include "hpcc.addResources" (dict "me" $thorScope.managerResources "root" .root) | indent 12 }} -{{- $thorManagerCmd := printf "thormaster_lcr %s %s _HPCC_ARGS_" (include "hpcc.configArg" .me) (include "hpcc.daliArg" (dict "root" .root "component" "Thor" "optional" false)) }} -{{ include "hpcc.addCommandAndLifecycle" (. | merge (dict "command" $thorManagerCmd)) | indent 12 }} +{{- $extraArgs := list "_HPCC_ARGS_" -}} +{{- include "hpcc.addCommandAndLifecycle" (. | merge (dict "isJob" true "process" "thormaster_lcr" "extraArgs" $extraArgs "component" "Thor Manager" "optional" false)) | nindent 12 }} env: {{- include "hpcc.mergeEnvironments" (dict "env" (append (.env | default list) (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_")) "defaultArenas" 2) | nindent 12 }} {{- include "hpcc.generateImageEnv" . 
| nindent 12 }} workingDir: /var/lib/HPCCSystems volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" .me | indent 12 }} +{{ include "hpcc.addEphemeralVolumeMounts" .me | indent 12 }} {{ include "hpcc.addVolumeMounts" . | indent 12 }} {{ include "hpcc.addSecretVolumeMounts" . | indent 12 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" . | indent 12 }} @@ -195,8 +200,9 @@ data: {{- if $misc.postJobCommandViaSidecar }} {{ include "hpcc.addWaitAndRunVolumeMount" . | indent 12 }} {{- end }} +{{- include "hpcc.addPostRunContainer" (. | merge (dict "isJob" true)) | nindent 10 }} volumes: -{{ include "hpcc.addConfigMapVolume" .me | indent 10 }} +{{ include "hpcc.addEphemeralVolumes" .me | indent 10 }} {{ include "hpcc.addVolumes" . | indent 10 }} {{ include "hpcc.addSecretVolumes" . | indent 10 }} {{ include "hpcc.addVaultClientCertificateVolumes" . | indent 10 }} @@ -207,6 +213,7 @@ data: restartPolicy: Never backoffLimit: 0 +{{- $_ := set $configCtx "lifeCycleCtx" (dict "containers" list) }} thorworker-job.yaml: apiVersion: batch/v1 kind: Job @@ -234,6 +241,7 @@ data: spec: {{- include "hpcc.placementsByJobTargetType" (dict "root" .root "job" $thorWorkerJobName "target" .me.name "type" "thor") | indent 10 }} serviceAccountName: hpcc-default + terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }} initContainers: {{- include "hpcc.createConfigInitContainers" . | indent 10 }} {{- include "hpcc.addImagePullSecrets" . 
| nindent 10 -}} @@ -241,24 +249,28 @@ data: shareProcessNamespace: true {{- end }} containers: +{{- $_ := set $configCtx "lifeCycleCtx" (dict "containers" list) -}} {{- range $containerNum := untilStep 1 (int (add1 $numWorkersPerPod)) 1 }} {{- $slavePort := add 20100 (mul 100 (sub $containerNum 1)) -}} {{- if and ($misc.postJobCommandViaSidecar) (eq $containerNum 1) }} {{ include "hpcc.addWaitAndRunContainer" ($configCtx | merge (dict "command" $misc.postJobCommand)) | indent 10 }} {{- end }} - - name: {{ $thorWorkerJobName }}-{{ $containerNum }} +{{- $containerName := printf "%s-%d" $thorWorkerJobName $containerNum }} + - name: {{ $containerName }} {{- include "hpcc.addSecurityContext" $configCtx | indent 12 }} {{ include "hpcc.addImageAttrs" $configCtx | indent 12 }} {{- include "hpcc.addResources" (dict "me" $thorScope.workerResources "root" $configCtx.root) | indent 12 }} -{{- $thorWorkerCmd := printf "thorslave_lcr %s %s _HPCC_ARGS_ --slaveport=%d" (include "hpcc.configArg" $configCtx.me) (include "hpcc.daliArg" (dict "root" $configCtx.root "component" "Thor" "optional" false)) $slavePort }} -{{ include "hpcc.addCommandAndLifecycle" ($configCtx | merge (dict "command" $thorWorkerCmd)) | indent 12 }} +{{- $extraArgs := list "_HPCC_ARGS_" (printf "--slaveport=%d" $slavePort) -}} +{{- include "hpcc.addCommandAndLifecycle" ($configCtx | merge (dict "isJob" true "process" "thorslave_lcr" "containerName" $containerName "extraArgs" $extraArgs "component" "Thor Worker" "optional" false "overrideConfigName" $thorScope.name)) | nindent 12 }} env: {{- $env := concat ($configCtx.me.env | default list) (list (dict "name" "MY_JOB_NAME" "value" "_HPCC_JOBNAME_") (dict "name" "MY_CONTAINER_NAME" "value" (printf "%s-%d" $thorWorkerJobName $containerNum))) }} {{- include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 8) | nindent 12 }} {{- include "hpcc.generateImageEnv" $configCtx | nindent 12 }} workingDir: /var/lib/HPCCSystems volumeMounts: -{{ include 
"hpcc.addConfigMapVolumeMount" (deepCopy $configCtx.me | merge (dict "tmpSubPath" $containerNum)) | indent 12 }} +{{ include "hpcc.addTempVolumeMount" (dict "name" $containerName "volumeName" $configCtx.me.name) | indent 12 }} +{{ include "hpcc.addRuntimeVolumeMount" (dict "name" $containerName "volumeName" $configCtx.me.name) | indent 12 }} +{{ include "hpcc.addConfigMapVolumeMount" (dict "name" $configCtx.me.name "volumeName" $configCtx.me.name) | indent 12 }} {{ include "hpcc.addVolumeMounts" $configCtx | indent 12 }} {{ include "hpcc.addSecretVolumeMounts" $configCtx | indent 12 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $configCtx | indent 12 }} @@ -267,8 +279,9 @@ data: {{ include "hpcc.addWaitAndRunVolumeMount" $configCtx | indent 12 }} {{- end }} {{- end }}{{/* range */}} +{{- include "hpcc.addPostRunContainer" ($configCtx | merge (dict "isJob" true)) | nindent 10 }} volumes: -{{ include "hpcc.addConfigMapVolume" .me | indent 10 }} +{{ include "hpcc.addEphemeralVolumes" .me | indent 10 }} {{ include "hpcc.addVolumes" . | indent 10 }} {{ include "hpcc.addSecretVolumes" . | indent 10 }} {{ include "hpcc.addVaultClientCertificateVolumes" . | indent 10 }} @@ -341,6 +354,7 @@ data: {{- $_ := set $commonCtx "thorAgentReplicas" (.thorAgentReplicas | default 1) }} {{- $configSHA := include "hpcc.getConfigSHA" ($commonCtx | merge (dict "configMapHelper" "hpcc.thorConfigMap" "component" "thor" "excludeKeys" "global")) }} {{- include "hpcc.checkDefaultStoragePlane" $commonCtx }} +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) }} apiVersion: apps/v1 kind: Deployment metadata: @@ -376,12 +390,8 @@ spec: containers: - name: {{ $commonCtx.eclAgentName | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "agentexec") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "agentexec") | nindent 16 }} - {{ include "hpcc.configArg" . 
}}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "Thor" "optional" false) }}, - {{ printf "\"--name=%s\"" .name }} - ] +{{- $extraArgs := list (printf "--name=%s" .name) -}} +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "agentexec" "extraArgs" $extraArgs "component" "Thor ECL Agent" "optional" false)) | nindent 8 }} env: {{- include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | nindent 8 }} {{- include "hpcc.generateImageEnv" $commonCtx | nindent 8 }} @@ -395,18 +405,20 @@ spec: {{- end }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" $commonCtx.eclAgentName "component" "eclagent" "includeRemote" true) | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . | indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addCertificateVolume" (dict "root" $ "name" $commonCtx.eclAgentName "component" "eclagent" "includeRemote" true) | indent 6 }} --- +{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) }} apiVersion: apps/v1 kind: Deployment metadata: @@ -439,13 +451,8 @@ spec: containers: - name: {{ $commonCtx.thorAgentName | quote }} workingDir: /var/lib/HPCCSystems - command: [ {{ include "hpcc.componentCommand" (dict "me" . 
"root" $ "process" "agentexec") }} ] - args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "agentexec") | nindent 16 }} - {{ include "hpcc.configArg" . }}, - {{ "\"--componentTag=thoragent\"" }}, - {{ include "hpcc.daliArg" (dict "root" $ "component" "Thor" "optional" false) }}, - {{ printf "\"--name=%s\"" .name }} - ] +{{- $extraArgs := list "--componentTag=thoragent" (printf "--name=%s" .name) -}} +{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "agentexec" "extraArgs" $extraArgs "component" "Thor Agent" "optional" false)) | nindent 8 }} env: {{- include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 1) | nindent 8 }} {{- include "hpcc.generateImageEnv" $commonCtx | nindent 8 }} @@ -455,13 +462,14 @@ spec: {{- include "hpcc.addStubResources" (dict "me" .stubResources "root" $ ) | indent 8 }} {{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }} volumeMounts: -{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }} +{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }} {{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addSecretVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }} {{ include "hpcc.addCertificateVolumeMount" (dict "root" $ "name" $commonCtx.thorAgentName "component" "thoragent" "includeRemote" true) | indent 8 }} +{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }} volumes: -{{ include "hpcc.addConfigMapVolume" . | indent 6 }} +{{ include "hpcc.addEphemeralVolumes" . 
| indent 6 }} {{ include "hpcc.addVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addSecretVolumes" $commonCtx | indent 6 }} {{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }} diff --git a/helm/hpcc/values.schema.json b/helm/hpcc/values.schema.json index 312019680bc..4abf45aca50 100644 --- a/helm/hpcc/values.schema.json +++ b/helm/hpcc/values.schema.json @@ -1612,6 +1612,9 @@ }, "expert": { "$ref": "#/definitions/expert" + }, + "terminationGracePeriodSeconds": { + "$ref": "#/definitions/terminationGracePeriodSeconds" } } }, @@ -1707,6 +1710,9 @@ }, "expert": { "$ref": "#/definitions/expert" + }, + "terminationGracePeriodSeconds": { + "$ref": "#/definitions/terminationGracePeriodSeconds" } } }, @@ -2428,6 +2434,9 @@ }, "hpa": { "$ref": "#/definitions/hpa" + }, + "terminationGracePeriodSeconds": { + "$ref": "#/definitions/terminationGracePeriodSeconds" } } }, @@ -2687,6 +2696,9 @@ "type": "boolean", "default": false, "description": "Require SOAPCALL and HTTPCALL URLs are secrets or mapped to secrets" + }, + "terminationGracePeriodSeconds": { + "$ref": "#/definitions/terminationGracePeriodSeconds" } } }, @@ -2758,6 +2770,9 @@ }, "expert": { "$ref": "#/definitions/expert" + }, + "terminationGracePeriodSeconds": { + "$ref": "#/definitions/terminationGracePeriodSeconds" } } }, @@ -2798,7 +2813,8 @@ "annotations": {}, "labels": {}, "egress": {}, - "expert": {} + "expert": {}, + "terminationGracePeriodSeconds": {} }, "additionalProperties": false } @@ -2847,7 +2863,8 @@ "limit": {}, "cutoff": {}, "egress": {}, - "expert": {} + "expert": {}, + "terminationGracePeriodSeconds": {} }, "additionalProperties": false } @@ -2876,7 +2893,8 @@ "limit": {}, "cutoff": {}, "egress": {}, - "expert": {} + "expert": {}, + "terminationGracePeriodSeconds": {} }, "additionalProperties": false } @@ -2903,7 +2921,8 @@ "limit": {}, "cutoff": {}, "egress": {}, - "expert": {} + "expert": {}, + "terminationGracePeriodSeconds": {} }, "additionalProperties": false } @@ 
-2938,7 +2957,8 @@ "annotations": {}, "labels": {}, "egress": {}, - "expert": {} + "expert": {}, + "terminationGracePeriodSeconds": {} }, "additionalProperties": false }, @@ -2964,7 +2984,9 @@ "resources": {}, "annotations": {}, "labels": {}, - "egress": {} + "egress": {}, + "expert": {}, + "terminationGracePeriodSeconds": {} }, "additionalProperties": false }, @@ -3055,6 +3077,9 @@ }, "expert": { "$ref": "#/definitions/expert" + }, + "terminationGracePeriodSeconds": { + "$ref": "#/definitions/terminationGracePeriodSeconds" } } }, diff --git a/initfiles/CMakeLists.txt b/initfiles/CMakeLists.txt index acc3501a34e..9dc1b8685be 100644 --- a/initfiles/CMakeLists.txt +++ b/initfiles/CMakeLists.txt @@ -45,7 +45,9 @@ if ( PLATFORM AND UNIX ) install ( FILES lib/libjlib.so-gdb.py DESTINATION ${LIB_DIR} COMPONENT Runtime ) if ( CONTAINERIZED ) install ( PROGRAMS bin/k8s_postjob_clearup.sh DESTINATION ${EXEC_DIR} COMPONENT Runtime ) - install ( PROGRAMS bin/check_executes DESTINATION ${EXEC_DIR} COMPONENT Runtime ) + install ( PROGRAMS bin/check_executes.sh DESTINATION ${EXEC_DIR} COMPONENT Runtime ) + install ( PROGRAMS bin/container_watch.sh DESTINATION ${EXEC_DIR} COMPONENT Runtime ) + install ( PROGRAMS bin/collect_postmortem.sh DESTINATION ${EXEC_DIR} COMPONENT Runtime ) else () ADD_SUBDIRECTORY(etc) ADD_SUBDIRECTORY(bash) diff --git a/initfiles/bin/check_executes b/initfiles/bin/check_executes deleted file mode 100755 index 67d17c95564..00000000000 --- a/initfiles/bin/check_executes +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -usage() { - echo "Usage: check_executes [options] -- cmd args" - echo " -d Mounted directory to store post-mortem info in" - echo " -f Specifies a file to preserve on post-mortem" -} - -PMD_DIRECTORYBASE=$(pwd) -PMD_PROGNAME= -PMD_COPYFILES=() -PMD_DALISERVER= -PMD_WORKUNIT= -PMD_ALWAYS=false - -while [ "$#" -gt 0 ]; do - arg=$1 - if [[ ${arg:0:1} == '-' ]]; then - case "${arg:1:1}" in - -) shift - PMD_PROGNAME=$1 - shift - break - ;; - d) 
shift; - PMD_DIRECTORYBASE=$1 - ;; - f) shift; - PMD_COPYFILES+=($1) - ;; - a) PMD_ALWAYS=true - ;; - *) usage - exit - ;; - esac - else - usage - exit - fi - shift -done - -if [[ -z ${PMD_PROGNAME} ]] ; then - usage - exit -fi - -# Scan managed process parameters for additional information -for (( arg=1; arg <= "$#"; arg++ )); do - optname=${!arg%=*} - optval=${!arg#*=} - if [[ ${optname} == '--config' ]]; then - PMD_COPYFILES+=(${optval}) - elif [[ ${optname} == '--daliServers' ]]; then - PMD_DALISERVER=${optval} - elif [[ ${optname} == '--workunit' ]]; then - PMD_WORKUNIT=${optval} - fi -done - -ulimit -c unlimited - -function cleanup { - echo "EXIT via signal for $progPid" - if [ -n "$progPid" ]; then - kill $progPid - wait $progPid - retVal=$? - fi -} - -# Ensure any signals to the script kill the child process -# NB: do not include SIGEXIT since when handled, it will cause the script to exit prematurely. -trap cleanup SIGTERM SIGINT SIGABRT SIGQUIT SIGHUP - -# Execute the main program, defaulting postmortem logging on (can be overriden by program's config file) -${PMD_PROGNAME} --logging.postMortem=1000 "$@" & -progPid=$! - -echo "Waiting for child process $progPid" -# If the signal handler (cleanup) was called, it will wait and catpure retVal and cause this 'wait $progPid' to exit on completion. -# NB: If the signal handler itself doesn't wait, then it will still cause this statement to complete before the child process has exited. -wait $progPid -retVal2=$? -if [ ! 
-v retVal ]; then - retVal=$retVal2 -fi -echo "Child process $progPid has exited with exit code $retVal" - -# If it did not exit cleanly, copy some post-mortem info -if [ $PMD_ALWAYS = true ] || [ $retVal -ne 0 ]; then - POST_MORTEM_DIR=${PMD_DIRECTORYBASE}/$(hostname)/$(date -Iseconds) - mkdir -p ${POST_MORTEM_DIR} - echo "Post-mortem info gathered in $POST_MORTEM_DIR" - echo "Process exited with code $retVal" | tee $POST_MORTEM_DIR/info.log - for f in ${PMD_COPYFILES[@]}; do - mkdir -p $POST_MORTEM_DIR/$(dirname $f) - cp $f $POST_MORTEM_DIR/$f - echo "Copied $f to $POST_MORTEM_DIR/$f" | tee -a $POST_MORTEM_DIR/info.log - done - cp `ls -rt /tmp/postmortem.$progPid.log.*` $POST_MORTEM_DIR - rm /tmp/postmortem.$progPid.log.* - - readarray -t core_files < <(find . -maxdepth 1 -type f -name 'core*' -print) - # we only expect one, but cater for multiple - if [[ ${#core_files[@]} -gt 0 ]]; then - for file in "${core_files[@]}"; do - echo "Generating info from core file($file) to $POST_MORTEM_DIR/info.log" | tee -a $POST_MORTEM_DIR/info.log - gdb -batch -ix /opt/HPCCSystems/bin/.gdbinit -x /opt/HPCCSystems/bin/post-mortem-gdb ${PMD_PROGNAME} $file 2>$POST_MORTEM_DIR/info.err >>$POST_MORTEM_DIR/info.log - echo "Generated info from core file($file)" | tee -a $POST_MORTEM_DIR/info.log - rm $file - done - fi - dmesg -xT > $POST_MORTEM_DIR/dmesg.log - if [[ -n "${PMD_DALISERVER}" ]] && [[ -n "${PMD_WORKUNIT}" ]]; then - if [[ -s wuid ]]; then # takes precedence over command line option - PMD_WORKUNIT=$(cat wuid) - fi - wutool postmortem ${PMD_WORKUNIT} DALISERVER=${PMD_DALISERVER} PMD=${POST_MORTEM_DIR} - echo Updated workunit ${PMD_WORKUNIT} - fi -else - echo "Process exited cleanly (code=0)" -fi -trap - EXIT -exit $retVal - - diff --git a/initfiles/bin/check_executes.sh b/initfiles/bin/check_executes.sh new file mode 100755 index 00000000000..405c8885eea --- /dev/null +++ b/initfiles/bin/check_executes.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +# 
---------------------------------------------------------------------------- +# This script is the main entry point for all HPCC helm components. +# It will launch the component process and check its exit status. +# If the process exits with a non-zero status, it will collect post-mortem +# information and associate with the workunit if specified. +# It is also responsible for periodically updating the running file that +# the postrun sidecar monitors (see container_watch.sh). +# ---------------------------------------------------------------------------- + +# The yaml passes in all arguments as a single string, due to the way the args are built up, +# and additional args _HPCC_ARGS_ are substituted in. +# Split the single string argument into individual arguments +eval set -- "$1" + +usage() { + echo "Usage: check_executes [options] -- cmd args" + echo " -c The name of the container" + echo " -d Mounted directory to store post-mortem info in" + echo " -f Specifies a file to preserve on post-mortem" +} + +PMD_DIRECTORYBASE=$(pwd) +PMD_PROGNAME= +PMD_DALISERVER= +PMD_WORKUNIT= +PMD_CONTAINERNAME= +PMD_ALWAYS=false +PMD_VALGRIND=false +PMD_POSTRUN=false + + +while [ "$#" -gt 0 ]; do + arg=$1 + if [[ ${arg:0:1} == '-' ]]; then + case "${arg:1:1}" in + -) shift + PMD_PROGNAME=$1 + shift + break + ;; + c) shift + PMD_CONTAINERNAME=$1 + ;; + d) shift; + PMD_DIRECTORYBASE=$1 + ;; + a) PMD_ALWAYS=true + ;; + v) PMD_VALGRIND=true + ;; + p) PMD_POSTRUN=true + ;; + *) echo "Unknown option: ${arg:1:1}" + usage + exit + ;; + esac + else + echo "Unknown argument: $arg" + usage + exit + fi + shift +done + +if [[ -z ${PMD_PROGNAME} ]] ; then + usage + exit +fi + +# Scan managed process parameters for additional information +for (( arg=1; arg <= "$#"; arg++ )); do + optname=${!arg%=*} + optval=${!arg#*=} + if [[ ${optname} == '--config' ]]; then + PMD_CONFIG=(${optval}) + elif [[ ${optname} == '--daliServers' ]]; then + PMD_DALISERVER=${optval} + elif [[ ${optname} == '--workunit' 
]]; then + PMD_WORKUNIT=${optval} + fi +done + +ulimit -c unlimited + +function cleanup +{ + echo "EXIT via signal for $progPid" + + if [ -n "$progPid" ]; then + if kill -0 $progPid 2>/dev/null; then + echo "Sending SIGTERM to process (PID $progPid)" + kill $progPid + wait $progPid + retVal=$? + fi + fi +} + +# Ensure any signals to the script kill the child process +# NB: do not include SIGEXIT since when handled, it will cause the script to exit prematurely. +trap cleanup SIGTERM SIGINT SIGABRT SIGQUIT SIGHUP + +runningFilename=/tmp/running +stoppedFilename=/tmp/stopped +if [ "$PMD_POSTRUN" = "true" ]; then + if [ -f ${runningFilename} ] || [ -f ${stoppedFilename} ]; then + echo "${runningFilename} and/or ${stoppedFilename} already exists. It suggests this container restarted quickly (postrun sidecar hasn't spotted and cleared up yet)" + secs=40 + echo "Waiting ${secs} seconds for postrun sidecar to spot and collate postmortem." + sleep ${secs} + echo "Continuing..." + fi +fi + +# Execute the main program, defaulting postmortem logging on (can be overridden by program's config file) +if [ "$PMD_VALGRIND" = "true" ]; then + echo running valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --num-callers=8 --log-fd=1 ${PMD_PROGNAME} --logging.postMortem=1000 "$@" + valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --num-callers=8 --log-fd=1 ${PMD_PROGNAME} --logging.postMortem=1000 "$@" & +else + echo running ${PMD_PROGNAME} --logging.postMortem=1000 "$@" + ${PMD_PROGNAME} --logging.postMortem=1000 "$@" & +fi + +progPid=$! + +> ${runningFilename} + +/bin/bash -c "while true; do touch ${runningFilename}; sleep 5; done" & +heartbeatPid=$! + +echo "Waiting for child process $progPid" +# If the signal handler (cleanup) was called, it will wait and capture retVal and cause this 'wait $progPid' to exit on completion.
+# NB: If the signal handler itself doesn't wait, then it will still cause this statement to complete before the child process has exited. +wait $progPid +retVal2=$? +if [ ! -v retVal ]; then + retVal=$retVal2 +fi +echo "Child process $progPid exited with exit code $retVal" +kill -1 $heartbeatPid + +> ${stoppedFilename} + +trap '' SIGTERM SIGINT SIGABRT SIGQUIT SIGHUP + +# If it did not exit cleanly, copy some post-mortem info +if [ $PMD_ALWAYS = true ] || [ $retVal -ne 0 ]; then + extraArgs=() + if [[ -n "$PMD_WORKUNIT" ]]; then + extraArgs+=("--workunit=$PMD_WORKUNIT") + fi + echo "Collecting post mortem info" + collect_postmortem.sh "--directory=${PMD_DIRECTORYBASE}" "--daliServer=${PMD_DALISERVER}" "--container=${PMD_CONTAINERNAME}" "--process=${PMD_PROGNAME}" "${extraArgs[@]}" + echo "Post mortem collection completed" +else + echo "Process exited cleanly (code=0)" +fi +k8s_postjob_clearup.sh +exit $retVal diff --git a/initfiles/bin/collect_postmortem.sh b/initfiles/bin/collect_postmortem.sh new file mode 100755 index 00000000000..89915620c24 --- /dev/null +++ b/initfiles/bin/collect_postmortem.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# ---------------------------------------------------------------------------- +# This script collects various post-mortem information and writes it to a unique +# subdirectory within the specified output directory (which should be on persistent +# storage, e.g. the debug plane). +# It will then associate this new directory with the workunit if available. +# It is either launched from the main container's entrypoint script (check_executes.sh), +# or from the 'postrun' sidecar container (container_watch.sh).
+# ---------------------------------------------------------------------------- + +container="" +daliServer="" +directory="" +external=false +process="" +workunit="" + +usage() +{ + echo "Usage: $0 --directory=DIRECTORY --daliServer=SERVER:PORT --container=CONTAINER_NAME --process=PROCESS_NAME" + exit 1 +} + +if [[ $# -lt 1 ]]; then + usage +fi + +while [[ $# -gt 0 ]]; do + case "$1" in + --container=*) + container="${1#*=}" + shift + ;; + --daliServer=*) + daliServer="${1#*=}" + shift + ;; + --directory=*) + directory="${1#*=}" + shift + ;; + --external|--external=true) + external=true + shift + ;; + --process=*) + process="${1#*=}" + shift + ;; + --workunit=*) + workunit="${1#*=}" + shift + ;; + --isJob) + shift + ;; + --*) + echo "Error: Unknown option '$1'" + usage + ;; + *) + # Once a non-option argument is encountered, break out of the options parsing + break + ;; + esac +done + +if [[ -z "$directory" ]]; then + echo "Error: --directory option is required." + usage +fi + +if [[ -z "$process" ]]; then + echo "Error: --process option is required." + usage +fi + +if [[ "$external" == true ]]; then + # we are in shared root of containers that mount with subPath, ${container} is the sub path + cd ${container} + pwd +fi + +containerBaseTmpDir="/tmp" +configFile="/etc/config" +if [[ "$external" == true ]]; then + containerBaseTmpDir="${containerBaseTmpDir}/${container}" + configFile="${configFile}/${container}" +fi +configFile="${configFile}/${container}.yaml" +wuidFilename="./wuid" +if [[ -e $wuidFilename ]]; then # takes precedence over command line option + workunit=$(cat ${wuidFilename}) +fi +POST_MORTEM_DIR="$directory" +if [[ -n "${workunit}" ]]; then + POST_MORTEM_DIR="${directory}/${workunit}" +fi +POST_MORTEM_DIR=${POST_MORTEM_DIR}/${container}/${process}/$(hostname)/$(date -Iseconds) +mkdir -p ${POST_MORTEM_DIR} +echo "Post-mortem info gathered in $POST_MORTEM_DIR" + +readarray -t core_files < <(find . 
-maxdepth 1 -type f -name 'core*' -print) +# we only expect one, but cater for multiple +if [[ ${#core_files[@]} -gt 0 ]]; then + for file in "${core_files[@]}"; do + echo "Generating info from core file($file) to $POST_MORTEM_DIR/info.log" | tee -a $POST_MORTEM_DIR/info.log + gdb -batch -ix /opt/HPCCSystems/bin/.gdbinit -x /opt/HPCCSystems/bin/post-mortem-gdb /opt/HPCCSystems/bin/${process} $file 2>$POST_MORTEM_DIR/info.err >>$POST_MORTEM_DIR/info.log + echo "Generated info from core file($file)" | tee -a $POST_MORTEM_DIR/info.log + rm $file + done +else + echo "Container instance ${container} stopped abruptly (OOM?)" | tee $POST_MORTEM_DIR/info.log +fi + +if [[ -f $configFile ]]; then + cp $configFile $POST_MORTEM_DIR + echo "Copied $configFile to $POST_MORTEM_DIR" | tee -a $POST_MORTEM_DIR/info.log +fi +cp `ls -rt ${containerBaseTmpDir}/postmortem.*.log.*` $POST_MORTEM_DIR +rm ${containerBaseTmpDir}/postmortem.*.log.* + +dmesg -xT > $POST_MORTEM_DIR/dmesg.log +if [[ -n "${daliServer}" ]] && [[ -n "${workunit}" ]]; then + wutool postmortem ${workunit} DALISERVER=${daliServer} PMD=${POST_MORTEM_DIR} + echo Updated workunit ${workunit} +fi + diff --git a/initfiles/bin/container_watch.sh b/initfiles/bin/container_watch.sh new file mode 100755 index 00000000000..4ad1df6b6d9 --- /dev/null +++ b/initfiles/bin/container_watch.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# ---------------------------------------------------------------------------- +# This script is used by the 'postrun' sidecar container to monitor the +# main containers by tracking their 'running' and 'stopped' files. +# If a 'running' file is not updated within a certain time period, the +# script will trigger a postmortem collection. +# This can happen if the main containers are abruptly halted by k8s, +# e.g. due to k8s OOM evictions. +# NB: The isJob option causes the script to exit if it detects that the +# job has finished (i.e. the main container has stopped). 
Otherwise, it +# will continue to loop and wait for a new instance of the main container to +# restart. +# ---------------------------------------------------------------------------- + +config="" +daliServer="" +directory="" +workunit="" +declare -a containers=() +declare -a processes=() + +usage() +{ + echo "Usage: $0 --directory=DIRECTORY --daliServer=SERVER:PORT [container process container process ...]" + exit 1 +} + +if [[ $# -lt 1 ]]; then + usage +fi + +isJob=false +declare -a originalArgs=() +while [[ $# -gt 0 ]]; do + case "$1" in + --daliServer=*) + daliServer="${1#*=}" + originalArgs+=("$1") + shift + ;; + --directory=*) + directory="${1#*=}" + originalArgs+=("$1") + shift + ;; + --workunit=*) + workunit="${1#*=}" + originalArgs+=("$1") + shift + ;; + --isJob) + isJob=true + originalArgs+=("$1") + shift + ;; + --*) + echo "Error: Unknown option '$1'" + usage + ;; + *) + # Once a non-option argument is encountered, break out of the options parsing + break + ;; + esac +done + +if [[ -z "$directory" ]]; then + echo "Error: --directory option is required." + usage +fi + +# Ensure that the remaining arguments are in pairs +if (( $# % 2 != 0 )); then + echo "Error: After options, arguments should be in container process pairs." + usage +fi + +# Collect container-process pairs +while [[ $# -gt 0 ]]; do + containers+=($1) + processes+=($2) + shift 2 +done + +SIGNALLEDFILENAME="/tmp/postrunSignalled" +monitor_container() +{ + local container="$1" + local process="$2" + RUNNINGFILENAME="/tmp/${container}/running" + STOPPEDFILENAME="/tmp/${container}/stopped" + + while true; do + echo "Waiting for ${RUNNINGFILENAME} to be created..." + until [ -f "${RUNNINGFILENAME}" ]; do + sleep 5 + if [ -f "${STOPPEDFILENAME}" ]; then + break # will cause while loop below to exit + fi + done + + echo "${RUNNINGFILENAME} found. Starting to monitor." 
+ + retCode=0 + # Monitor the file + while true; do + if [ -f "${STOPPEDFILENAME}" ]; then + echo "${STOPPEDFILENAME} file detected. Exiting." + break + fi + if [ ! -f "${RUNNINGFILENAME}" ]; then + echo "${RUNNINGFILENAME} has been removed. Exiting." + break + fi + + # Check the file's age + CHKSECS=10 + MAXAGESECS=30 + FILE_MOD_TIME=$(stat -c %Y "${RUNNINGFILENAME}") + CURRENT_TIME=$(date +%s) + FILE_AGE=$((CURRENT_TIME - FILE_MOD_TIME)) + + if [ "${FILE_AGE}" -ge "${MAXAGESECS}" ]; then + echo "${container} ${RUNNINGFILENAME} is ${MAXAGESECS} seconds or older. Exiting." + retCode=1 # indicating bad container + break + else + echo "${container} file age is ${FILE_AGE}" + if [ "${FILE_AGE}" -lt "${CHKSECS}" ]; then + sleep $((CHKSECS - FILE_AGE)) + else + sleep ${CHKSECS} + fi + fi + done + echo removing ${STOPPEDFILENAME} and ${RUNNINGFILENAME}, retCode=${retCode} + rm -f "${STOPPEDFILENAME}" + rm -f "${RUNNINGFILENAME}" + + if [[ $retCode -eq 1 ]]; then + collect_postmortem.sh "${originalArgs[@]}" --container=${container} --process=${process} --external + fi + if [ -f "${SIGNALLEDFILENAME}" ]; then + echo "${SIGNALLEDFILENAME} file detected. Exiting." + break + fi + if [[ $isJob == true ]]; then + echo "${container} finished in job. No longer monitoring." 
+ break + fi + done +} + +function exitTrap +{ + echo "Postrun signalled" + > ${SIGNALLEDFILENAME} # cause monitor_container to exit when passed running age check (instead of looping around expecting new container) + wait +} + +trap exitTrap SIGTERM SIGINT SIGABRT SIGQUIT SIGHUP + +for i in "${!containers[@]}"; do + container="${containers[i]}" + process="${processes[i]}" + echo "Container: $container, Process: $process" + monitor_container "$container" "$process" & +done + +wait \ No newline at end of file diff --git a/initfiles/bin/k8s_postjob_clearup.sh b/initfiles/bin/k8s_postjob_clearup.sh index 494e4292aad..fb3572c31ab 100755 --- a/initfiles/bin/k8s_postjob_clearup.sh +++ b/initfiles/bin/k8s_postjob_clearup.sh @@ -10,13 +10,22 @@ fi # NB: k8s resources created by HPCC follow the naming convention: -- # In the loop below, each k8s filename is parsed to extract these fields, the resourceName is constructed, # and it and resourceType are used to delete the resource. -for filename in *.k8s; do - IFS=, read componentName resourceType jobName <<< $(basename ${filename} .k8s) - resourceName="${componentName}-${resourceType}-${jobName}" - echo Performing: kubectl delete $resourceType/$resourceName - kubectl delete $resourceType/$resourceName - rm -f $filename -done +k8sFiles=$(ls *.k8s 2>/dev/null) + +if [ -n "$k8sFiles" ]; then + echo "Deleting k8s job resources" + for filename in $k8sFiles; do + IFS=, read componentName resourceType jobName graceTime <<< $(basename ${filename} .k8s) + resourceName="${componentName}-${resourceType}-${jobName}" + graceOpt="" + if [[ $graceTime -ne 0 ]]; then + graceOpt="--grace-period=${graceTime}" + fi + echo Performing: kubectl delete $resourceType/$resourceName $graceOpt + kubectl delete $resourceType/$resourceName $graceOpt + rm -f $filename + done +fi exit 0 diff --git a/system/jlib/jcontainerized.cpp b/system/jlib/jcontainerized.cpp index ca081c6fcb1..14e78d86ef2 100644 --- a/system/jlib/jcontainerized.cpp +++ 
b/system/jlib/jcontainerized.cpp @@ -266,10 +266,13 @@ bool applyYaml(const char *componentName, const char *wuid, const char *job, con if (autoCleanup) { - // touch a file, with naming convention { componentName },{ resourceType },{ jobName }.k8s + unsigned deleteJobGracePeriod = 0; + if (strcmp(resourceType, "job") == 0) + deleteJobGracePeriod = getComponentConfigSP()->getPropInt("@terminationGracePeriodSeconds", defaultDeleteJobGracePeriod); + // touch a file, with naming convention { componentName },{ resourceType },{ jobName },{ graceTimeSecs }.k8s // it will be used if the job fails ungracefully, to tidy up leaked resources // normally (during graceful cleanup) these resources and files will be deleted by deleteResource - VStringBuffer k8sResourcesFilename("%s,%s,%s.k8s", componentName, resourceType, jobName.str()); + VStringBuffer k8sResourcesFilename("%s,%s,%s,%u.k8s", componentName, resourceType, jobName.str(), deleteJobGracePeriod); touchFile(k8sResourcesFilename); } diff --git a/system/jlib/jcontainerized.hpp b/system/jlib/jcontainerized.hpp index d868be659a7..4641f0153fb 100644 --- a/system/jlib/jcontainerized.hpp +++ b/system/jlib/jcontainerized.hpp @@ -32,6 +32,7 @@ jlib_decl const char *queryMyJobName(); enum class KeepJobs { none, podfailures, all }; jlib_decl KeepJobs translateKeepJobs(const char *keepJobs); +constexpr unsigned defaultDeleteJobGracePeriod = 120; // this is time during deleteResource. Time for pod containers to stop and capture any info. 
jlib_decl bool isActiveService(const char *serviceName); jlib_decl void deleteResource(const char *componentName, const char *job, const char *resource); diff --git a/system/jlib/jptree.cpp b/system/jlib/jptree.cpp index 70d483495ad..80b4f7acb7a 100644 --- a/system/jlib/jptree.cpp +++ b/system/jlib/jptree.cpp @@ -8582,7 +8582,7 @@ static void applyCommandLineOption(IPropertyTree * config, const char * option, { config = config->queryPropTree(elemName); if (!config) - throw makeStringExceptionV(99, "Cannot overriding scalar configuration element %s with structure", elemName.get()); + throw makeStringExceptionV(99, "Cannot override scalar configuration element %s with structure", elemName.get()); } option = tail+1; } diff --git a/thorlcr/master/thgraphmanager.cpp b/thorlcr/master/thgraphmanager.cpp index ed2bd7d7f1c..07008ac50b5 100644 --- a/thorlcr/master/thgraphmanager.cpp +++ b/thorlcr/master/thgraphmanager.cpp @@ -1510,6 +1510,7 @@ void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphNam break; } } + saveWuidToFile(""); // clear wuid file. Signifies that no wuid is running. lingerTimer.reset(lingerPeriod); } } diff --git a/thorlcr/slave/slavmain.cpp b/thorlcr/slave/slavmain.cpp index 6dcaa9097a9..bd7c4b9fafa 100644 --- a/thorlcr/slave/slavmain.cpp +++ b/thorlcr/slave/slavmain.cpp @@ -1953,6 +1953,8 @@ class CJobListener : public CSimpleInterface // reset for next job setProcessAborted(false); + saveWuidToFile(""); // clear wuid file. Signifies that no wuid is running. + if (exception) throw exception.getClear(); // NB: this will cause exception to be part of the reply to master