From d83da9fba06c7f628ccc89311296da25ed88d6b6 Mon Sep 17 00:00:00 2001 From: Jake Smith Date: Tue, 21 May 2024 13:31:16 +0100 Subject: [PATCH] HPCC-31902 Adjust Thor auditing info to contain pod/container meta info Signed-off-by: Jake Smith --- thorlcr/master/thdemonserver.cpp | 8 ----- thorlcr/master/thgraphmanager.cpp | 59 +++++++++++++++++++++---------- thorlcr/master/thgraphmanager.hpp | 3 ++ thorlcr/master/thmastermain.cpp | 14 ++++---- thorlcr/thorutil/thormisc.cpp | 2 +- 5 files changed, 51 insertions(+), 35 deletions(-) diff --git a/thorlcr/master/thdemonserver.cpp b/thorlcr/master/thdemonserver.cpp index 5f7d7f8b989..be154603883 100644 --- a/thorlcr/master/thdemonserver.cpp +++ b/thorlcr/master/thdemonserver.cpp @@ -273,14 +273,6 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer unsigned startTime = msTick(); graphStarts.append(startTime); reportGraph(graph, false, true, startTime, getTimeStampNowValue()); - const char *graphname = graph->queryJob().queryGraphName(); - if (memcmp(graphname,"graph",5)==0) - graphname+=5; - LOG(MCauditInfo,",Progress,Thor,StartSubgraph,%s,%s,%s,%u,%s,%s", - queryServerStatus().queryProperties()->queryProp("@thorname"), - graph->queryJob().queryWuid(), - graphname, - (unsigned)graph->queryGraphId(), queryServerStatus().queryProperties()->queryProp("@nodeGroup"), queryServerStatus().queryProperties()->queryProp("@queue")); } void endGraph(CGraphBase *graph, bool success) { diff --git a/thorlcr/master/thgraphmanager.cpp b/thorlcr/master/thgraphmanager.cpp index 4f8a9f1315a..32154ef040c 100644 --- a/thorlcr/master/thgraphmanager.cpp +++ b/thorlcr/master/thgraphmanager.cpp @@ -423,10 +423,7 @@ void CJobManager::fatal(IException *e) { IERRLOG("Unknown exception in CJobManager::fatal"); } - LOG(MCauditInfo,",Progress,Thor,Terminate,%s,%s,%s,exception", - queryServerStatus().queryProperties()->queryProp("@thorname"), - queryServerStatus().queryProperties()->queryProp("@nodeGroup"), - 
queryServerStatus().queryProperties()->queryProp("@queue")); + auditThorSystemEvent("Terminate", {"exception"}); queryLogMsgManager()->flushQueue(10*1000); @@ -890,13 +887,8 @@ bool CJobManager::doit(IConstWorkUnit *workunit, const char *graphName, const So JobNameScope activeJobName(wuid); LOG(MCdebugInfo, "Processing wuid=%s, graph=%s from agent: %s", wuid.str(), graphName, agentep.getEndpointHostText(s).str()); - LOG(MCauditInfo,",Progress,Thor,Start,%s,%s,%s,%s,%s,%s", - queryServerStatus().queryProperties()->queryProp("@thorname"), - wuid.str(), - graphName, - user.str(), - queryServerStatus().queryProperties()->queryProp("@nodeGroup"), - queryServerStatus().queryProperties()->queryProp("@queue")); + auditThorJobEvent("Start", wuid, graphName, user); + Owned<IException> e; bool allDone = false; try @@ -904,13 +896,7 @@ bool CJobManager::doit(IConstWorkUnit *workunit, const char *graphName, const So allDone = executeGraph(*workunit, graphName, agentep); } catch (IException *_e) { e.setown(_e); } - LOG(MCauditInfo,",Progress,Thor,Stop,%s,%s,%s,%s,%s,%s", - queryServerStatus().queryProperties()->queryProp("@thorname"), - wuid.str(), - graphName, - user.str(), - queryServerStatus().queryProperties()->queryProp("@nodeGroup"), - queryServerStatus().queryProperties()->queryProp("@queue")); + auditThorJobEvent("Stop", wuid, graphName, user); if (e.get()) throw e.getClear(); return allDone; @@ -1285,7 +1271,6 @@ void closeThorServerStatus() } } - /* * Waits on recv for another wuid/graph to run. 
* Return values: @@ -1358,6 +1343,42 @@ void publishPodNames(IWorkUnit *workunit, const char *graphName) } } +static void auditThorSystemEventBuilder(std::string &msg, const char *eventName, std::initializer_list<const char *> args) +{ + msg += std::string(",Progress,Thor,") + eventName + "," + getComponentConfigSP()->queryProp("@name"); + for (auto arg : args) + msg += "," + std::string(arg); + if (isContainerized()) + msg += std::string(",") + k8s::queryMyPodName() + "," + k8s::queryMyContainerName(); + else + { + const char *nodeGroup = queryServerStatus().queryProperties()->queryProp("@nodeGroup"); + const char *queueName = queryServerStatus().queryProperties()->queryProp("@queue"); + msg += std::string(",") + nodeGroup + "," + queueName; + } +} + +void auditThorSystemEvent(const char *eventName) +{ + std::string msg; + auditThorSystemEventBuilder(msg, eventName, {}); + LOG(MCauditInfo, "%s", msg.c_str()); +} + +void auditThorSystemEvent(const char *eventName, std::initializer_list<const char *> args) +{ + std::string msg; + auditThorSystemEventBuilder(msg, eventName, args); + LOG(MCauditInfo, "%s", msg.c_str()); +} + +void auditThorJobEvent(const char *eventName, const char *wuid, const char *graphName, const char *user) +{ + std::string msg; + auditThorSystemEventBuilder(msg, eventName, { wuid, graphName, nullText(user) }); + LOG(MCauditInfo, "%s", msg.c_str()); +} + void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphName) { aborting = 0; diff --git a/thorlcr/master/thgraphmanager.hpp b/thorlcr/master/thgraphmanager.hpp index 98c394e654b..d705259b17f 100644 --- a/thorlcr/master/thgraphmanager.hpp +++ b/thorlcr/master/thgraphmanager.hpp @@ -34,6 +34,9 @@ int queryExitCode(); void addConnectedWorkerPod(const char *podName, const char *containerName); void publishPodNames(IWorkUnit *workunit, const char *graphName); void relayWuidException(IConstWorkUnit *wu, const IException *exception); +void auditThorSystemEvent(const char *eventName); +void 
auditThorSystemEvent(const char *eventName, std::initializer_list<const char *> args); +void auditThorJobEvent(const char *eventName, const char *wuid, const char *graphName, const char *user); #endif diff --git a/thorlcr/master/thmastermain.cpp b/thorlcr/master/thmastermain.cpp index b08a8b65ce0..f889e0bd7b2 100644 --- a/thorlcr/master/thmastermain.cpp +++ b/thorlcr/master/thmastermain.cpp @@ -596,10 +596,7 @@ bool ControlHandler(ahType type) if (auditStartLogged) { auditStartLogged = false; - LOG(MCauditInfo,",Progress,Thor,Terminate,%s,%s,%s,ctrlc", - queryServerStatus().queryProperties()->queryProp("@thorname"), - queryServerStatus().queryProperties()->queryProp("@nodeGroup"), - queryServerStatus().queryProperties()->queryProp("@queue")); + auditThorSystemEvent("Terminate", {"ctrlc"}); } queryLogMsgManager()->flushQueue(10*1000); _exit(TEC_CtrlC); @@ -968,7 +965,10 @@ int main( int argc, const char *argv[] ) getClusterThorQueueName(queueNames, thorName); #else if (!thorName) + { thorName = "thor"; + globals->setProp("@name", thorName); + } SCMStringBuffer queueNames; getThorQueueNames(queueNames, thorName); #endif @@ -992,6 +992,7 @@ int main( int argc, const char *argv[] ) masterSlaveMpTag = allocateClusterMPTag(); kjServiceMpTag = allocateClusterMPTag(); + auditThorSystemEvent("Initializing"); unsigned numWorkers = 0; if (isContainerized()) { @@ -1000,7 +1001,6 @@ int main( int argc, const char *argv[] ) StringBuffer thorEpStr; LOG(MCdebugProgress, "ThorMaster version %d.%d, Started on %s", THOR_VERSION_MAJOR,THOR_VERSION_MINOR,thorEp.getEndpointHostText(thorEpStr).str()); - LOG(MCdebugProgress, "Thor name = %s, queue = %s, nodeGroup = %s",thorname,queueName.str(),nodeGroup.str()); unsigned numWorkersPerPod = 1; if (!globals->hasProp("@numWorkers")) @@ -1126,7 +1126,7 @@ int main( int argc, const char *argv[] ) PROGLOG("Persistent Thor group created with group name: %s", uniqueGrpName.str()); } #endif - LOG(MCauditInfo, 
",Progress,Thor,Startup,%s,%s,%s,%s",nodeGroup.str(),thorname,queueName.str(),logUrl.str()); + auditThorSystemEvent("Startup"); auditStartLogged = true; writeSentinelFile(sentinelFile); @@ -1140,7 +1140,7 @@ int main( int argc, const char *argv[] ) // NB: workunit/graphName only set in one-shot mode (if isCloud()) thorMain(logHandler, workunit, graphName); - LOG(MCauditInfo, ",Progress,Thor,Terminate,%s,%s,%s",thorname,nodeGroup.str(),queueName.str()); + auditThorSystemEvent("Terminate"); LOG(MCdebugProgress, "ThorMaster terminated OK"); } catch (IException *e) diff --git a/thorlcr/thorutil/thormisc.cpp b/thorlcr/thorutil/thormisc.cpp index a97fc94c5f8..fd4d721f2c1 100644 --- a/thorlcr/thorutil/thormisc.cpp +++ b/thorlcr/thorutil/thormisc.cpp @@ -1692,4 +1692,4 @@ void saveWuidToFile(const char *wuid) if (!wuidFileIO) throw makeStringException(0, "Failed to create file 'wuid' to store current workunit for post mortem script"); wuidFileIO->write(0, strlen(wuid), wuid); -} \ No newline at end of file +}