From 0172f1542cada73c111296151c5ccac0d27356e2 Mon Sep 17 00:00:00 2001 From: Gavin Halliday Date: Thu, 21 Nov 2024 12:09:17 +0000 Subject: [PATCH] HPCC-33015 Improve system resilience when thor crashes * Ensure that a thor engine that has crashed is no longer associated with a workunit * Ensure that a thor instance that never processes a workunit terminates cleanly Signed-off-by: Gavin Halliday --- common/workunit/workunit.cpp | 5 +++++ thorlcr/master/thgraphmanager.cpp | 37 ++++++++++++++++++++++--------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/common/workunit/workunit.cpp b/common/workunit/workunit.cpp index fe32bad10aa..04ebd2ce4dd 100644 --- a/common/workunit/workunit.cpp +++ b/common/workunit/workunit.cpp @@ -14500,6 +14500,11 @@ void executeThorGraph(const char * graphName, IConstWorkUnit &workunit, const IP // NB: check for expected success state (WUStateWait). If any other state, abort. { Owned w = &workunit.lock(); + //If the thor instance crashed, make sure that the workunit is no longer associated with it - otherwise a + //failure clause that causes a graph to run can abort because the instances has stopped. + if (w->getEngineSession() > 0) + w->setEngineSession(-1); + WUState state = w->getState(); if (WUStateWait != state) // expected state from successful Thor run from above { diff --git a/thorlcr/master/thgraphmanager.cpp b/thorlcr/master/thgraphmanager.cpp index bf095d9ae1d..7441eeaddf9 100644 --- a/thorlcr/master/thgraphmanager.cpp +++ b/thorlcr/master/thgraphmanager.cpp @@ -1473,8 +1473,13 @@ void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphNam Owned workunit; factory.setown(getWorkUnitFactory()); workunit.setown(factory->openWorkUnit(currentWuid)); - SessionId agentSessionID = workunit->getAgentSession(); - if (agentSessionID <= 0) + SessionId agentSessionID = workunit ? workunit->getAgentSession() : 0; + if (!workunit) + { + WARNLOG("Discarding job with missing workunit wuid=%s, graph=%s", currentWuid.str(), currentGraphName.str()); + currentWuid.clear(); + } + else if (agentSessionID <= 0) { WARNLOG("Discarding job with invalid sessionID: wuid=%s, graph=%s (sessionID=%" I64F "d)", currentWuid.str(), currentGraphName.str(), agentSessionID); currentWuid.clear(); @@ -1524,8 +1529,8 @@ void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphNam } } } - currentGraphName.clear(); + currentGraphName.clear(); if (lingerPeriod) { PROGLOG("Lingering time left: %.2f", ((float)lingerPeriod)/1000); @@ -1545,15 +1550,25 @@ void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphNam break; // timeout/abort // else - reject/ignore duff message. } - if (0 == currentGraphName.length()) // only ever true if !multiJobLinger + + // The following is true if no workunit/graph have been received + // MORE: I think it should also be executed if lingerPeriod is 0 + if (0 == currentGraphName.length()) { - // De-register the idle lingering entry. - Owned factory; - Owned workunit; - factory.setown(getWorkUnitFactory()); - workunit.setown(factory->openWorkUnit(currentWuid)); - Owned w = &workunit->lock(); - w->setDebugValue(instance, "0", true); + if (!multiJobLinger) + { + // De-register the idle lingering entry. + Owned factory; + Owned workunit; + factory.setown(getWorkUnitFactory()); + workunit.setown(factory->openWorkUnit(currentWuid)); + //Unlikely, but the workunit could have been deleted while we were lingering + if (workunit) + { + Owned w = &workunit->lock(); + w->setDebugValue(instance, "0", true); + } + } break; } }