Skip to content

Commit

Permalink
HPCC-32683 Fix issues with postmortem and container death
Browse files Browse the repository at this point in the history
helm changes:
- Consistently generate command via addCommandAndLifecycle,
and collect added container information in lifeCycleCtx
based on containers added to lifeCycleCtx
- Add terminationGracePeriodSeconds option
- Mount ephemeral directories in distinct subPaths.
- Fix issues with postmortems from different containers
overwriting one another
- Add addPostRunContainer to generate postrun container,
to monitor running containers.

script changes:
- add container_watch.sh for postrun pod.
- move all options handling into check_executes.sh

code changes:
- code the grace time into the k8s job file, so that postjob
clearup can also use it
- Clear wuid file (prevents spurious error association)

Signed-off-by: Jake Smith <[email protected]>
  • Loading branch information
jakesmith committed Dec 4, 2024
1 parent 16e1ad1 commit 2e1e047
Show file tree
Hide file tree
Showing 24 changed files with 813 additions and 361 deletions.
245 changes: 160 additions & 85 deletions helm/hpcc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -313,33 +313,85 @@ metrics:
{{- end -}}

{{/*
Add ConfigMap volume mount for a component
Add tmp volume mount
*/}}
{{- define "hpcc.addConfigMapVolumeMount" -}}
- name: {{ .name }}-temp-volume
{{- define "hpcc.addTempVolumeMount" -}}
{{- $volumeName := .volumeName | default .name -}}
- name: {{ $volumeName }}-temp-volume
mountPath: /tmp
- name: {{ .name }}-hpcctmp-volume
{{- if not .noSubPath }}
subPath: {{ .name | quote }}
{{- end -}}
{{- end -}}

{{/*
Add runtime volume mount
*/}}
{{- define "hpcc.addRuntimeVolumeMount" -}}
{{- $volumeName := .volumeName | default .name -}}
- name: {{ $volumeName }}-hpcctmp-volume
mountPath: /var/lib/HPCCSystems
{{- if .tmpSubPath }}
subPath: {{ .tmpSubPath | quote }}
{{- end }}
{{- if not .noSubPath }}
subPath: {{ .name | quote }}
{{- end -}}
{{- end -}}

{{/*
Emit the volumeMount entry for a component's ConfigMap volume
Pass in dict with name and optional noSubPath
If noSubPath is set the volume is mounted at /etc/config/<name>, otherwise at /etc/config
*/}}
{{- define "hpcc.addConfigMapVolumeMount" -}}
{{- $mountPath := "/etc/config" -}}
{{- if .noSubPath -}}
{{- $mountPath = printf "/etc/config/%s" .name -}}
{{- end -}}
- name: {{ .name }}-configmap-volume
  mountPath: {{ $mountPath }}
{{- end -}}

{{/*
Add ConfigMap volume for a component
Add standard ephemeral volume mounts for a component
*/}}
{{- define "hpcc.addConfigMapVolume" -}}
{{- define "hpcc.addEphemeralVolumeMounts" -}}
{{ include "hpcc.addTempVolumeMount" . }}
{{ include "hpcc.addRuntimeVolumeMount" . }}
{{ include "hpcc.addConfigMapVolumeMount" . }}
{{- end -}}

{{/*
Declare the emptyDir volume named <name>-temp-volume (the volume the tmp volume mount refers to)
Pass in dict with name
*/}}
{{- define "hpcc.addTempVolume" -}}
{{- $component := .name -}}
- name: {{ $component }}-temp-volume
  emptyDir: {}
{{- end -}}

{{/*
Declare the emptyDir volume named <name>-hpcctmp-volume (the volume the runtime volume mount refers to)
Pass in dict with name
*/}}
{{- define "hpcc.addRuntimeVolume" -}}
{{- $component := .name -}}
- name: {{ $component }}-hpcctmp-volume
  emptyDir: {}
{{- end -}}

{{/*
Declare the volume named <name>-configmap-volume, backed by the ConfigMap <name>-configmap
Pass in dict with name
*/}}
{{- define "hpcc.addConfigMapVolume" -}}
{{- $component := .name -}}
- name: {{ $component }}-configmap-volume
  configMap:
    name: {{ $component }}-configmap
{{- end -}}


{{/*
Add the standard ephemeral volumes for a component: its tmp, runtime and ConfigMap volumes
Pass in dict with name
*/}}
{{- define "hpcc.addEphemeralVolumes" -}}
{{ include "hpcc.addTempVolume" . }}
{{ include "hpcc.addRuntimeVolume" . }}
{{ include "hpcc.addConfigMapVolume" . }}
{{- end -}}

{{/*
Get mount details
Pass in plane
Expand Down Expand Up @@ -734,57 +786,24 @@ Check that the storage and spill planes for a component exist
{{- end -}}

{{/*
Add command for a component
*/}}
{{- define "hpcc.componentCommand" -}}
{{- if .me.valgrind -}}
valgrind
{{- else if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}}
check_executes
{{- else -}}
{{ .process }}
{{- end }}
{{- end -}}

{{/*
Add extra args for a component
Add config arg for a component
*/}}
{{- define "hpcc.componentStartArgs" -}}
{{- if .me.valgrind -}}
"--leak-check=full",
"--show-leak-kinds=all",
"--track-origins=yes",
"--num-callers=8",
"--log-fd=1",
{{ .process | quote }},
{{- else if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}}
{{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}}
{{- $meExpert := .me.expert | default dict -}}
{{- $globalExpert := .root.Values.global.expert | default dict -}}
{{- $alwaysPostMortem := (hasKey $meExpert "alwaysPostMortem") | ternary $meExpert.alwaysPostMortem ($globalExpert.alwaysPostMortem | default false) -}}
{{- if $alwaysPostMortem -}}
"-a",{{ "\n" }}
{{- end -}}
"-d", {{ $prefix }},
"--",
{{ .process | quote }},
{{- end }}
{{- define "hpcc.getConfigArg" -}}
/etc/config/{{ .name }}.yaml
{{- end -}}

{{/*
Add config arg for a component
*/}}
{{- define "hpcc.configArg" -}}
"--config=/etc/config/{{ .name }}.yaml"
"--config={{ include "hpcc.getConfigArg" . }}"
{{- end -}}

{{/*
Add dali arg for a component
Get dali endpoint for a component
Pass in dict with root, component (in case of error), optional (true if daliArg is optional)
*/}}
{{- define "hpcc.daliArg" -}}
{{- define "hpcc.getDali" -}}
{{- if empty .root.Values.dali -}}
{{- if not .optional -}}
{{- $_ := fail (printf "%s requires a DALI to be defined" .component) -}}
Expand All @@ -794,10 +813,22 @@ Pass in dict with root, component (in case of error), optional (true if daliArg
{{- $daliService := $dali.service | default dict -}}
{{- $daliHost := .overrideDaliHost | default $dali.name -}}
{{- $daliServicePort := .overrideDaliPort | default ($daliService.servicePort | default 7070) -}}
"--daliServers={{ $daliHost }}:{{ $daliServicePort }}"
{{ $daliHost }}:{{ $daliServicePort }}
{{- end -}}
{{- end -}}


{{/*
Generate the quoted --daliServers=<endpoint> argument for a component
Pass in dict with root, component (in case of error), optional (true if daliArg is optional)
Generates nothing when hpcc.getDali yields an empty endpoint
*/}}
{{- define "hpcc.daliArg" -}}
{{- with (include "hpcc.getDali" .) -}}
"--daliServers={{ . }}"
{{- end -}}
{{- end -}}

{{/*
Get image name
*/}}
Expand Down Expand Up @@ -1022,6 +1053,53 @@ NB: uid=10000 and gid=10001 are the uid/gid of the hpcc user, built into platfor
{{- include "hpcc.configContainer" . | nindent 0 -}}
{{- end -}}

{{/*
A sidecar container ("postrun") that runs container_watch.sh against the containers
recorded in lifeCycleCtx
Pass in dict with root, me and lifeCycleCtx, plus optional isJob
- lifeCycleCtx.containers: list of dicts with name, process and config. The name and process
  of each entry are appended to the container_watch.sh arguments, and each distinct config
  gets its own ConfigMap volume mount (mounted with noSubPath)
- isJob: if set, --isJob is passed to container_watch.sh
Generates nothing unless the postRunSidecar expert setting is enabled (me.expert takes
precedence over global.expert, default true) and a plane with category "debug" exists
NB: every templated command argument is quoted, so that values which look like YAML
booleans, numbers or nulls are still rendered as strings
*/}}
{{- define "hpcc.addPostRunContainer" -}}
{{- $meExpert := .me.expert | default dict -}}
{{- $globalExpert := .root.Values.global.expert | default dict -}}
{{- $postRun := (hasKey $meExpert "postRunSidecar") | ternary $meExpert.postRunSidecar ((hasKey $globalExpert "postRunSidecar") | ternary $globalExpert.postRunSidecar true) }}
{{- if $postRun }}
{{- if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}}
{{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}}
{{- $dali := include "hpcc.getDali" . -}}
- name: postrun
{{- include "hpcc.addImageAttrs" . | nindent 2 }}
  command:
  - container_watch.sh
  - {{ printf "--directory=%s" $prefix | quote }}
{{- if $dali }}
  - {{ printf "--daliServer=%s" $dali | quote }}
{{- end }}
{{- if .isJob }}
  - --isJob
{{- end }}
{{- range $container := .lifeCycleCtx.containers }}
  - {{ $container.name | quote }}
  - {{ $container.process | quote }}
{{- end }}
{{- include "hpcc.addSecurityContext" . | indent 2 }}
  volumeMounts:
{{- include "hpcc.addTempVolumeMount" (.me | merge (dict "noSubPath" "true")) | nindent 2 }}
{{- include "hpcc.addRuntimeVolumeMount" (.me | merge (dict "noSubPath" "true")) | nindent 2 }}
{{- $uniqueConfigs := dict -}}
{{- range $container := .lifeCycleCtx.containers -}}
{{- $config := $container.config -}}
{{- $_ := set $uniqueConfigs $config true -}}
{{- end -}}
{{- $me := .me -}}
{{- range $config, $_ := $uniqueConfigs }}
{{- include "hpcc.addConfigMapVolumeMount" ($me | merge (dict "name" $config "noSubPath" "true")) | nindent 2 -}}
{{- end -}}
{{- include "hpcc.addVolumeMounts" (dict "root" .root "me" $me "includeCategories" (list "debug")) | nindent 2 }}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Container to watch for a file on a shared mount and execute a command
Pass in dict with me and command
Expand Down Expand Up @@ -1095,7 +1173,7 @@ Pass in a dictionary with root and me defined
{{- define "hpcc.addSecurityContext" }}
{{- $user := (.root.Values.global.user | default dict) }}
securityContext:
{{- if .root.Values.global.privileged }}
{{- if (or .root.Values.global.privileged .privileged) }}
privileged: true
capabilities:
add:
Expand Down Expand Up @@ -1428,7 +1506,7 @@ data:

{{/*
A template to generate Sasha service containers
Pass in dict with root, me and dali if container in dali pod
Pass in dict with root, me, lifeCycleCtx and dali if container in dali pod
*/}}
{{- define "hpcc.addSashaContainer" }}
{{- $serviceName := printf "sasha-%s" .me.name }}
Expand All @@ -1437,14 +1515,7 @@ Pass in dict with root, me and dali if container in dali pod
{{- $env := concat (.root.Values.global.env | default list) (.env | default list) }}
- name: {{ $serviceName | quote }}
workingDir: /var/lib/HPCCSystems
command: [ saserver ]
args: [
{{- with (dict "name" $serviceName) }}
{{ include "hpcc.configArg" . }},
{{- end }}
"--service={{ .me.name }}",
{{ include "hpcc.daliArg" (dict "root" .root "component" "Sasha" "optional" false "overrideDaliHost" $overrideDaliHost "overrideDaliPort" $overrideDaliPort) | indent 10 }}
]
{{- include "hpcc.addCommandAndLifecycle" (merge (pick . "root" "lifeCycleCtx") (dict "me" (.me | merge (dict "name" $serviceName))) (dict "process" "saserver" "extraArgs" (list (printf "--service=%s" .me.name)) "component" "Sasha" "optional" false "overrideConfigName" $serviceName "overrideDaliHost" $overrideDaliHost "overrideDaliPort" $overrideDaliPort)) | nindent 2 }}
{{- include "hpcc.addResources" (dict "me" .me.resources "root" .root) | indent 2 }}
{{- include "hpcc.addSecurityContext" . | indent 2 }}
env:
Expand Down Expand Up @@ -1822,11 +1893,11 @@ Pass in dict with root, pod, target and type

{{/*
Generate lifecycle, command and args
Pass in root, me and command
Pass in root, me, process and lifeCycleCtx, plus optional isJob, containerName, overrideConfigName and extraArgs
*/}}
{{- define "hpcc.addCommandAndLifecycle" -}}
{{- $misc := .root.Values.global.misc | default dict }}
{{- $postJobCommand := $misc.postJobCommand | default "" }}
{{- $misc := .root.Values.global.misc | default dict -}}
{{- $postJobCommand := (.isJob | default false) | ternary $misc.postJobCommand "" -}}
lifecycle:
preStop:
exec:
Expand All @@ -1835,38 +1906,42 @@ lifecycle:
- "-c"
- >-
k8s_postjob_clearup.sh
{{- if and (not $misc.postJobCommandViaSidecar) $postJobCommand }} ;
{{- if $misc.postJobCommandViaSidecar }} ;
touch /wait-and-run/{{ .me.name }}.jobdone
{{- else if $postJobCommand }} ;
{{ $postJobCommand }}
{{- end }}
command: ["/bin/bash"]
args:
- -c
{{- $check_cmd := dict "command" .command}}
{{- if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- end -}}
{{- $meExpert := .me.expert | default dict -}}
{{- $globalExpert := .root.Values.global.expert | default dict -}}
{{- $containerName := .containerName | default .me.name -}}
{{- $args := list -}}
{{- $configCtx := (hasKey . "overrideConfigName") | ternary (dict "name" .overrideConfigName) .me -}}
{{- if .me.valgrind -}}
{{- $args = append $args "-v" -}}
{{- else if (include "hpcc.hasPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory" (dict "root" .root "category" "debug")) -}}
{{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}}
{{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}}
{{- $pmd_always_opt := "" -}}
{{- $globalExpert := .root.Values.global.expert | default dict -}}
{{- $meExpert := .me.expert | default dict -}}
{{- $alwaysPostMortem := (hasKey $meExpert "alwaysPostMortem") | ternary $meExpert.alwaysPostMortem ($globalExpert.alwaysPostMortem | default false) -}}
{{- if $alwaysPostMortem -}}
{{- $pmd_always_opt = "-a " -}}
{{- $args = append $args "-a" -}}
{{- end -}}
{{- $_ := set $check_cmd "command" (printf "check_executes %s-d %s -- %s" $pmd_always_opt $prefix .command) -}}
{{- end }}
- >-
{{ $check_cmd.command }};
exitCode=$?;
k8s_postjob_clearup.sh;
{{- if $misc.postJobCommandViaSidecar -}}
touch /wait-and-run/{{ .me.name }}.jobdone;
{{- else if $postJobCommand -}}
{{ $postJobCommand }} ;
{{- $postRun := (hasKey $meExpert "postRunSidecar") | ternary $meExpert.postRunSidecar ((hasKey $globalExpert "postRunSidecar") | ternary $globalExpert.postRunSidecar true) -}}
{{- if $postRun -}}
{{- $args = append $args "-p" -}}
{{- end -}}
{{- $args = concat $args (list "-d" $prefix "-c" $containerName "--") -}}
{{- $_ := set .lifeCycleCtx "containers" (append .lifeCycleCtx.containers (dict "name" $containerName "process" .process "config" $configCtx.name)) -}}
{{- end -}}
{{- $args = append $args .process -}}
{{- $args = append $args (include "hpcc.configArg" $configCtx) -}}
{{- $args = append $args (include "hpcc.daliArg" .) -}}
{{- if hasKey . "extraArgs" -}}
{{- $args = concat $args .extraArgs -}}
{{- end }}
exit $exitCode;
command: ["check_executes.sh"]
args: [ {{ join " " $args }} ]
{{- end -}}

{{- define "hpcc.addCertificateImpl" }}
{{- if (.root.Values.certificates | default dict).enabled -}}
{{- $externalCert := .externalCert -}}
Expand Down
12 changes: 6 additions & 6 deletions helm/hpcc/templates/dafilesrv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ data:
{{- if not .disabled -}}
{{- $env := concat ($.Values.global.env | default list) (.env | default list) -}}
{{- $commonCtx := dict "root" $ "me" . "env" $env "exposure" "local" "visibility" .service.visibility "includeCategories" (list "data" "debug") -}}
{{- $_ := set $commonCtx "lifeCycleCtx" (dict "containers" list) -}}
{{- if (eq "spray" .application) -}}
{{- $_ := set $commonCtx "includeCategories" (concat $commonCtx.includeCategories (list "lz" "remote")) -}}
{{- end -}}
Expand Down Expand Up @@ -58,16 +59,14 @@ spec:
spec:
{{- include "hpcc.placementsByPodTargetType" (dict "root" $ "pod" .name "type" "dafilesrv") | indent 6 }}
serviceAccountName: "hpcc-default"
terminationGracePeriodSeconds: {{ .terminationGracePeriodSeconds | default 600 }}
initContainers:
{{- include "hpcc.createConfigInitContainers" $commonCtx | indent 6 }}
{{- include "hpcc.addImagePullSecrets" $commonCtx | nindent 6 -}}
containers:
- name: {{ .name | quote }}
workingDir: /var/lib/HPCCSystems
command: [ {{ include "hpcc.componentCommand" (dict "me" . "root" $ "process" "dafilesrv") }} ]
args: [ {{- include "hpcc.componentStartArgs" (dict "me" . "root" $ "process" "dafilesrv") | nindent 16 }}
{{ include "hpcc.configArg" . }}
]
{{- include "hpcc.addCommandAndLifecycle" ($commonCtx | merge (dict "process" "dafilesrv" "component" "DaFileSrv" "optional" false)) | nindent 8 }}
env:
{{ include "hpcc.mergeEnvironments" (dict "env" $env "defaultArenas" 2) | indent 8 -}}
- name: "SENTINEL"
Expand All @@ -77,7 +76,7 @@ spec:
{{- include "hpcc.addResources" (dict "me" .resources "root" $) | indent 8 }}
{{ include "hpcc.addImageAttrs" $commonCtx | indent 8 }}
volumeMounts:
{{ include "hpcc.addConfigMapVolumeMount" . | indent 8 }}
{{ include "hpcc.addEphemeralVolumeMounts" . | indent 8 }}
{{ include "hpcc.addVolumeMounts" $commonCtx | indent 8 }}
{{ include "hpcc.addVaultClientCertificateVolumeMounts" $commonCtx | indent 8 }}
{{- if $commonCtx.certificatesEnabled }}
Expand All @@ -87,8 +86,9 @@ spec:
{{- $_ := fail (printf "dafilesrv[application=stream]- certificates must be enabled to use") -}}
{{- end }}
{{- end }}
{{- include "hpcc.addPostRunContainer" $commonCtx | nindent 6 }}
volumes:
{{ include "hpcc.addConfigMapVolume" . | indent 6 }}
{{ include "hpcc.addEphemeralVolumes" . | indent 6 }}
{{ include "hpcc.addVolumes" $commonCtx | indent 6 }}
{{ include "hpcc.addVaultClientCertificateVolumes" $commonCtx | indent 6 }}
{{- if $commonCtx.certificatesEnabled }}
Expand Down
Loading

0 comments on commit 2e1e047

Please sign in to comment.