From bf7d04ff5029de1e41d418dbfe32206e087f96e9 Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Wed, 13 Dec 2023 15:30:28 +0000 Subject: [PATCH] HPCC-31017 Report cause of k8s thorworker job failure Ensure that the cause of the failure to apply the k8s thorworker job is reported back to the workunit. Also suppress follow-on 'backoff' failure if the primary cause of failure has already been reported. Signed-off-by: Jake Smith --- thorlcr/master/thmastermain.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/thorlcr/master/thmastermain.cpp b/thorlcr/master/thmastermain.cpp index 459c0d92773..adc174d9a84 100644 --- a/thorlcr/master/thmastermain.cpp +++ b/thorlcr/master/thmastermain.cpp @@ -1162,18 +1162,23 @@ int main( int argc, const char *argv[] ) } if (isContainerized()) { + int retCode = exception ? TEC_Exception : 0; if (!cloudJobName.isEmpty()) { + if (exception) + { + Owned factory = getWorkUnitFactory(); + Owned wu = factory->openWorkUnit(workunit); + if (wu) + { + relayWuidException(wu, exception); + retCode = 0; // if successfully reported, suppress thormanager exit failure that would trigger another exception + } + } if (workerJobInstalled) { try { - if (exception) - { - Owned factory = getWorkUnitFactory(); - Owned wu = factory->openWorkUnit(workunit); - relayWuidException(wu, exception); - } k8s::KeepJobs keepJob = k8s::translateKeepJobs(globals->queryProp("@keepJobs")); switch (keepJob) { @@ -1208,7 +1213,7 @@ int main( int argc, const char *argv[] ) } } } - setExitCode(exception ? TEC_Exception : 0); + setExitCode(retCode); } // cleanup handler to be sure we end