Skip to content

Commit

Permalink
Merge pull request #17787 from jakesmith/HPCC-30305-job-failure
Browse files Browse the repository at this point in the history
HPCC-30305 Improve job failure detection and report

Reviewed-by: Gavin Halliday <[email protected]>
Merged-by: Gavin Halliday <[email protected]>
  • Loading branch information
ghalliday authored Sep 21, 2023
2 parents f584765 + b5d7aeb commit 9487b02
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 56 deletions.
10 changes: 6 additions & 4 deletions helm/hpcc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -1633,12 +1633,14 @@ args:
{{- end }}
- >-
{{ $check_cmd.command }};
exitCode=$?;
k8s_postjob_clearup.sh;
{{- if $misc.postJobCommandViaSidecar -}} ;
touch /wait-and-run/{{ .me.name }}.jobdone
{{- else if $postJobCommand -}} ;
{{ $postJobCommand }}
{{- if $misc.postJobCommandViaSidecar -}}
touch /wait-and-run/{{ .me.name }}.jobdone;
{{- else if $postJobCommand -}}
{{ $postJobCommand }} ;
{{- end }}
exit $exitCode;
{{- end -}}

{{/*
Expand Down
108 changes: 56 additions & 52 deletions system/jlib/jcontainerized.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

namespace k8s {

#ifdef _CONTAINERIZED
static StringBuffer myPodName;

const char *queryMyPodName()
Expand Down Expand Up @@ -63,13 +62,45 @@ void deleteResource(const char *componentName, const char *resourceType, const c
remove(k8sResourcesFilename);
}

// Scan a '|'-separated list of status entries for the first non-zero exit code.
//
// Each entry is expected to be a comma-separated triple:
//   <exitCode>,<"initContainer"|"container">,<name>
// Entries that do not split into exactly three fields, or whose exit code
// field is empty, are ignored.
//
// output      - receives a description of the first failing entry (appended).
// podStatuses - the '|'-separated list to scan.
//
// Returns true as soon as an entry with a non-zero exit code is found
// (after appending to 'output'), false if none is found.
bool checkExitCodes(StringBuffer &output, const char *podStatuses)
{
    for (const char *cursor = podStatuses; *cursor; )
    {
        // Isolate the current entry: up to the next '|', or the rest of the string.
        const char *separator = strchr(cursor, '|');
        StringBuffer entry;
        if (separator)
            entry.append((size_t)(separator-cursor), cursor);
        else
            entry.append(cursor);

        StringArray parts;
        parts.appendList(entry, ",");
        if (3 == parts.length())
        {
            const char *codeText = parts.item(0);
            if ('\0' != *codeText) // an empty exit code field is skipped
            {
                unsigned code = atoi(codeText);
                if (0 != code) // anything other than zero is treated as a failure
                {
                    output.appendf(" %s '%s' failed with exitCode = %u", parts.item(1), parts.item(2), code);
                    return true;
                }
            }
        }

        // No further separator means this was the last entry.
        if (!separator)
            return false;
        cursor = separator+1;
    }
    return false;
}

void waitJob(const char *componentName, const char *resourceType, const char *job, unsigned pendingTimeoutSecs, KeepJobs keepJob)
{
VStringBuffer jobName("%s-%s-%s", componentName, resourceType, job);
jobName.toLowerCase();
VStringBuffer waitJob("kubectl get jobs %s -o jsonpath={.status.active}", jobName.str());
VStringBuffer getScheduleStatus("kubectl get pods --selector=job-name=%s --output=jsonpath={.items[*].status.conditions[?(@.type=='PodScheduled')].status}", jobName.str());
VStringBuffer checkJobExitCode("kubectl get pods --selector=job-name=%s --output=jsonpath={.items[*].status.containerStatuses[?(@.name==\"%s\")].state.terminated.exitCode}", jobName.str(), jobName.str());

unsigned delay = 100;
unsigned start = msTick();
Expand All @@ -82,14 +113,30 @@ void waitJob(const char *componentName, const char *resourceType, const char *jo
{
StringBuffer output;
runKubectlCommand(componentName, waitJob, nullptr, &output);
if (!streq(output, "1")) // status.active value
if ((0 == output.length()) || streq(output, "0")) // status.active value
{
// Job is no longer active - we can terminate
DBGLOG("kubectl jobs output: %s", output.str());
runKubectlCommand(componentName, checkJobExitCode, nullptr, &output.clear());
if (output.length() && !streq(output, "0")) // state.terminated.exitCode
throw makeStringExceptionV(0, "Failed to run %s: pod exited with error: %s", jobName.str(), output.str());
break;
VStringBuffer checkJobExitStatus("kubectl get jobs %s '-o=jsonpath={range .status.conditions[*]}{.type}: {.status} - {.message}|{end}'", jobName.str());
runKubectlCommand(componentName, checkJobExitStatus, nullptr, &output.clear());
if (strstr(output.str(), "Failed: "))
{
VStringBuffer errMsg("Job %s failed [%s].", jobName.str(), output.str());
VStringBuffer checkInitContainerExitCodes("kubectl get pods --selector=job-name=%s '-o=jsonpath={range .items[*].status.initContainerStatuses[*]}{.state.terminated.exitCode},{\"initContainer\"},{.name}{\"|\"}{end}'", jobName.str());
runKubectlCommand(componentName, checkInitContainerExitCodes, nullptr, &output.clear());
DBGLOG("checkInitContainerExitCodes - output = %s", output.str());
if (!checkExitCodes(errMsg, output))
{
// no init container failures, check regular containers
VStringBuffer checkContainerExitCodes("kubectl get pods --selector=job-name=%s '-o=jsonpath={range .items[*].status.containerStatuses[*]}{.state.terminated.exitCode},{\"container\"},{.name}{\"|\"}{end}'", jobName.str());
runKubectlCommand(componentName, checkContainerExitCodes, nullptr, &output.clear());
DBGLOG("checkContainerExitCodes - output = %s", output.str());
checkExitCodes(errMsg, output);
}
throw makeStringException(0, errMsg);
}
else // assume success, either .status.conditions type of "Complete" or "Succeeded"
break;
}
runKubectlCommand(nullptr, getScheduleStatus, nullptr, &output.clear());

Expand Down Expand Up @@ -261,51 +308,8 @@ MODULE_INIT(INIT_PRIORITY_STANDARD)
}
// Module exit handler: hands podInfoInitCBId to removeConfigUpdateHook().
// NOTE(review): presumably this undoes a hook registration made by the matching
// MODULE_INIT, which is not fully visible here - confirm.
MODULE_EXIT()
{
removeConfigUpdateHook(podInfoInitCBId);
}

#else

// Variant on the #else side of the _CONTAINERIZED guard: performs no work and
// only invokes throwUnexpected().
// NOTE(review): no value is returned, so throwUnexpected() presumably never returns - confirm.
const char *queryMyPodName()
{
throwUnexpected();
}

// Variant on the #else side of the _CONTAINERIZED guard: 'keepJobs' is ignored
// and the body only invokes throwUnexpected().
// NOTE(review): no value is returned, so throwUnexpected() presumably never returns - confirm.
KeepJobs translateKeepJobs(const char *keepJobs)
{
throwUnexpected();
}

// Variant on the #else side of the _CONTAINERIZED guard: 'serviceName' is ignored
// and the body only invokes throwUnexpected().
// NOTE(review): no value is returned, so throwUnexpected() presumably never returns - confirm.
bool isActiveService(const char *serviceName)
{
throwUnexpected();
}

// Variant on the #else side of the _CONTAINERIZED guard: all arguments are ignored
// and the body only invokes throwUnexpected().
void deleteResource(const char *componentName, const char *job, const char *resource)
{
throwUnexpected();
}

// Variant on the #else side of the _CONTAINERIZED guard: all arguments are ignored
// and the body only invokes throwUnexpected().
void waitJob(const char *componentName, const char *resourceType, const char *job, unsigned pendingTimeoutSecs, KeepJobs keepJob)
{
throwUnexpected();
}

// Variant on the #else side of the _CONTAINERIZED guard: all arguments are ignored
// and the body only invokes throwUnexpected().
// NOTE(review): no value is returned, so throwUnexpected() presumably never returns - confirm.
bool applyYaml(const char *componentName, const char *wuid, const char *job, const char *resourceType, const std::list<std::pair<std::string, std::string>> &extraParams, bool optional, bool autoCleanup)
{
throwUnexpected();
}

// Variant on the #else side of the _CONTAINERIZED guard: all arguments are ignored
// and the body only invokes throwUnexpected().
void runJob(const char *componentName, const char *wuid, const char *job, const std::list<std::pair<std::string, std::string>> &extraParams)
{
throwUnexpected();
}

std::vector<std::vector<std::string>> getPodNodes(const char *selector)
{
throwUnexpected();
if (isContainerized())
removeConfigUpdateHook(podInfoInitCBId);
}

#endif // _CONTAINERIZED

} // end of k8s namespace

0 comments on commit 9487b02

Please sign in to comment.