Skip to content

Commit

Permalink
Merge pull request #18686 from jakesmith/HPCC-31902-audit-containerinfo
Browse files Browse the repository at this point in the history
HPCC-31902 Adjust Thor auditing info to contain pod/container meta info

Reviewed-by: Gavin Halliday <[email protected]>
Merged-by: Gavin Halliday <[email protected]>
  • Loading branch information
ghalliday authored Jun 14, 2024
2 parents fcb0440 + d83da9f commit 21ccf10
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 35 deletions.
8 changes: 0 additions & 8 deletions thorlcr/master/thdemonserver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,14 +273,6 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer
unsigned startTime = msTick();
graphStarts.append(startTime);
reportGraph(graph, false, true, startTime, getTimeStampNowValue());
const char *graphname = graph->queryJob().queryGraphName();
if (memcmp(graphname,"graph",5)==0)
graphname+=5;
LOG(MCauditInfo,",Progress,Thor,StartSubgraph,%s,%s,%s,%u,%s,%s",
queryServerStatus().queryProperties()->queryProp("@thorname"),
graph->queryJob().queryWuid(),
graphname,
(unsigned)graph->queryGraphId(), queryServerStatus().queryProperties()->queryProp("@nodeGroup"), queryServerStatus().queryProperties()->queryProp("@queue"));
}
void endGraph(CGraphBase *graph, bool success)
{
Expand Down
59 changes: 40 additions & 19 deletions thorlcr/master/thgraphmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -433,10 +433,7 @@ void CJobManager::fatal(IException *e)
{
IERRLOG("Unknown exception in CJobManager::fatal");
}
LOG(MCauditInfo,",Progress,Thor,Terminate,%s,%s,%s,exception",
queryServerStatus().queryProperties()->queryProp("@thorname"),
queryServerStatus().queryProperties()->queryProp("@nodeGroup"),
queryServerStatus().queryProperties()->queryProp("@queue"));
auditThorSystemEvent("Terminate", {"exception"});

queryLogMsgManager()->flushQueue(10*1000);

Expand Down Expand Up @@ -890,27 +887,16 @@ bool CJobManager::doit(IConstWorkUnit *workunit, const char *graphName, const So
JobNameScope activeJobName(wuid);

LOG(MCdebugInfo, "Processing wuid=%s, graph=%s from agent: %s", wuid.str(), graphName, agentep.getEndpointHostText(s).str());
LOG(MCauditInfo,",Progress,Thor,Start,%s,%s,%s,%s,%s,%s",
queryServerStatus().queryProperties()->queryProp("@thorname"),
wuid.str(),
graphName,
user.str(),
queryServerStatus().queryProperties()->queryProp("@nodeGroup"),
queryServerStatus().queryProperties()->queryProp("@queue"));
auditThorJobEvent("Start", wuid, graphName, user);

Owned<IException> e;
bool allDone = false;
try
{
allDone = executeGraph(*workunit, graphName, agentep);
}
catch (IException *_e) { e.setown(_e); }
LOG(MCauditInfo,",Progress,Thor,Stop,%s,%s,%s,%s,%s,%s",
queryServerStatus().queryProperties()->queryProp("@thorname"),
wuid.str(),
graphName,
user.str(),
queryServerStatus().queryProperties()->queryProp("@nodeGroup"),
queryServerStatus().queryProperties()->queryProp("@queue"));
auditThorJobEvent("Stop", wuid, graphName, user);

if (e.get()) throw e.getClear();
return allDone;
Expand Down Expand Up @@ -1285,7 +1271,6 @@ void closeThorServerStatus()
}
}


/*
* Waits on recv for another wuid/graph to run.
* Return values:
Expand Down Expand Up @@ -1358,6 +1343,42 @@ void publishPodNames(IWorkUnit *workunit, const char *graphName)
}
}

static void auditThorSystemEventBuilder(std::string &msg, const char *eventName, std::initializer_list<const char*> args)
{
msg += std::string(",Progress,Thor,") + eventName + "," + getComponentConfigSP()->queryProp("@name");
for (auto arg : args)
msg += "," + std::string(arg);
if (isContainerized())
msg += std::string(",") + k8s::queryMyPodName() + "," + k8s::queryMyContainerName();
else
{
const char *nodeGroup = queryServerStatus().queryProperties()->queryProp("@nodeGroup");
const char *queueName = queryServerStatus().queryProperties()->queryProp("@queue");
msg += std::string(",") + nodeGroup + "," + queueName;
}
}

// Logs (at MCauditInfo) the audit record for a Thor system event that carries no event specific fields.
void auditThorSystemEvent(const char *eventName)
{
    const std::initializer_list<const char*> noExtraFields{};
    std::string auditLine;
    auditThorSystemEventBuilder(auditLine, eventName, noExtraFields);
    LOG(MCauditInfo, "%s", auditLine.c_str());
}

// Logs (at MCauditInfo) the audit record for a Thor system event; 'args' supplies the
// event specific fields that the builder places after the component name.
void auditThorSystemEvent(const char *eventName, std::initializer_list<const char*> args)
{
    std::string auditLine;
    auditThorSystemEventBuilder(auditLine, eventName, args);
    const char *auditText = auditLine.c_str();
    LOG(MCauditInfo, "%s", auditText);
}

// Logs (at MCauditInfo) the audit record for a job level Thor event.
// The event specific fields are, in order: wuid, graphName, user.
void auditThorJobEvent(const char *eventName, const char *wuid, const char *graphName, const char *user)
{
    // only 'user' is routed through nullText(); wuid and graphName are forwarded as given
    const char *userField = nullText(user);
    std::string auditLine;
    auditThorSystemEventBuilder(auditLine, eventName, { wuid, graphName, userField });
    LOG(MCauditInfo, "%s", auditLine.c_str());
}

void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphName)
{
aborting = 0;
Expand Down
3 changes: 3 additions & 0 deletions thorlcr/master/thgraphmanager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ int queryExitCode();
void addConnectedWorkerPod(const char *podName, const char *containerName);
void publishPodNames(IWorkUnit *workunit, const char *graphName);
void relayWuidException(IConstWorkUnit *wu, const IException *exception);
// Thor audit logging helpers. Each logs one MCauditInfo message of the form
//   ,Progress,Thor,<eventName>,<component @name>[,<event fields>...],<a>,<b>
// where <a>,<b> are the pod and container names when containerized, otherwise the
// server status "@nodeGroup" and "@queue" properties.

// Audit a system event that has no event specific fields.
void auditThorSystemEvent(const char *eventName);
// Audit a system event; 'args' are the event specific fields, emitted in order.
void auditThorSystemEvent(const char *eventName, std::initializer_list<const char*> args);
// Audit a job event; the event specific fields are wuid, graphName and user (in that order).
void auditThorJobEvent(const char *eventName, const char *wuid, const char *graphName, const char *user);


#endif
14 changes: 7 additions & 7 deletions thorlcr/master/thmastermain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -596,10 +596,7 @@ bool ControlHandler(ahType type)
if (auditStartLogged)
{
auditStartLogged = false;
LOG(MCauditInfo,",Progress,Thor,Terminate,%s,%s,%s,ctrlc",
queryServerStatus().queryProperties()->queryProp("@thorname"),
queryServerStatus().queryProperties()->queryProp("@nodeGroup"),
queryServerStatus().queryProperties()->queryProp("@queue"));
auditThorSystemEvent("Terminate", {"ctrlc"});
}
queryLogMsgManager()->flushQueue(10*1000);
_exit(TEC_CtrlC);
Expand Down Expand Up @@ -968,7 +965,10 @@ int main( int argc, const char *argv[] )
getClusterThorQueueName(queueNames, thorName);
#else
if (!thorName)
{
thorName = "thor";
globals->setProp("@name", thorName);
}
SCMStringBuffer queueNames;
getThorQueueNames(queueNames, thorName);
#endif
Expand All @@ -992,6 +992,7 @@ int main( int argc, const char *argv[] )
masterSlaveMpTag = allocateClusterMPTag();
kjServiceMpTag = allocateClusterMPTag();

auditThorSystemEvent("Initializing");
unsigned numWorkers = 0;
if (isContainerized())
{
Expand All @@ -1000,7 +1001,6 @@ int main( int argc, const char *argv[] )

StringBuffer thorEpStr;
LOG(MCdebugProgress, "ThorMaster version %d.%d, Started on %s", THOR_VERSION_MAJOR,THOR_VERSION_MINOR,thorEp.getEndpointHostText(thorEpStr).str());
LOG(MCdebugProgress, "Thor name = %s, queue = %s, nodeGroup = %s",thorname,queueName.str(),nodeGroup.str());

unsigned numWorkersPerPod = 1;
if (!globals->hasProp("@numWorkers"))
Expand Down Expand Up @@ -1126,7 +1126,7 @@ int main( int argc, const char *argv[] )
PROGLOG("Persistent Thor group created with group name: %s", uniqueGrpName.str());
}
#endif
LOG(MCauditInfo, ",Progress,Thor,Startup,%s,%s,%s,%s",nodeGroup.str(),thorname,queueName.str(),logUrl.str());
auditThorSystemEvent("Startup");
auditStartLogged = true;

writeSentinelFile(sentinelFile);
Expand All @@ -1140,7 +1140,7 @@ int main( int argc, const char *argv[] )

// NB: workunit/graphName only set in one-shot mode (if isCloud())
thorMain(logHandler, workunit, graphName);
LOG(MCauditInfo, ",Progress,Thor,Terminate,%s,%s,%s",thorname,nodeGroup.str(),queueName.str());
auditThorSystemEvent("Terminate");
LOG(MCdebugProgress, "ThorMaster terminated OK");
}
catch (IException *e)
Expand Down
2 changes: 1 addition & 1 deletion thorlcr/thorutil/thormisc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1692,4 +1692,4 @@ void saveWuidToFile(const char *wuid)
if (!wuidFileIO)
throw makeStringException(0, "Failed to create file 'wuid' to store current workunit for post mortem script");
wuidFileIO->write(0, strlen(wuid), wuid);
}
}

0 comments on commit 21ccf10

Please sign in to comment.