Skip to content

Commit

Permalink
HPCC-30998 Fix post-mortem files being attached to wrong workunit.
Browse files Browse the repository at this point in the history
With multiJobLinger on (now the default), post-mortem files were
being attached to the 1st workunit that ran on a started Thor
instance, rather than the one that had most recently run and
caused the reports.

Fix by recording the current wuid to a local temp file, for
check_executes.sh to use when attaching post-mortem files.

Signed-off-by: Jake Smith <[email protected]>
  • Loading branch information
jakesmith committed Dec 11, 2023
1 parent bc9616b commit 479ca3e
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 0 deletions.
3 changes: 3 additions & 0 deletions initfiles/bin/check_executes
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ if [ $PMD_ALWAYS = true ] || [ $retVal -ne 0 ]; then
fi
dmesg -xT > $POST_MORTEM_DIR/dmesg.log
if [[ -n "${PMD_DALISERVER}" ]] && [[ -n "${PMD_WORKUNIT}" ]]; then
if [[ -s wuid ]]; then # takes precedence over command line option
PMD_WORKUNIT=$(cat wuid)
fi
wutool postmortem ${PMD_WORKUNIT} DALISERVER=${PMD_DALISERVER} PMD=${POST_MORTEM_DIR}
echo Updated workunit ${PMD_WORKUNIT}
fi
Expand Down
1 change: 1 addition & 0 deletions thorlcr/master/thgraphmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1473,6 +1473,7 @@ void thorMain(ILogMsgHandler *logHandler, const char *wuid, const char *graphNam
// NB: this set of pods could still already be published, if so, publishPodNames will not re-add.
}
currentWuid.set(wuid); // NB: will always be same if !multiJobLinger
saveWuidToFile(currentWuid);
break; // success
}
else if (ret < 0)
Expand Down
1 change: 1 addition & 0 deletions thorlcr/master/thmastermain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1002,6 +1002,7 @@ int main( int argc, const char *argv[] )
bool doWorkerRegistration = false;
if (isContainerized())
{
saveWuidToFile(workunit);
LogMsgJobId thorJobId = queryLogMsgManager()->addJobId(workunit);
thorJob.setJobID(thorJobId);
setDefaultJobId(thorJobId);
Expand Down
1 change: 1 addition & 0 deletions thorlcr/slave/slavmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1788,6 +1788,7 @@ class CJobListener : public CSimpleInterface
StringAttr wuid, graphName;
StringBuffer soPath;
msg.read(wuid);
saveWuidToFile(wuid);
msg.read(graphName);

Owned<ILoadedDllEntry> querySo;
Expand Down
10 changes: 10 additions & 0 deletions thorlcr/thorutil/thormisc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1690,4 +1690,14 @@ void CThorPerfTracer::stop()
EXCLOG(E);
::Release(E);
}
}

/**
 * Store the current workunit id in a local file named 'wuid', so the post mortem
 * script can find it (and if necessary publish files to it).
 *
 * The file is opened relative to the current working directory with IFOcreate and
 * the id is written at offset 0, with no trailing newline.
 *
 * A null wuid is treated as an empty id: the file is still opened, but nothing is
 * written to it.
 * NOTE(review): this relies on IFOcreate truncating an existing file, so that a
 * previously recorded id does not survive - confirm against IFile::open.
 *
 * @param wuid  Workunit id to record (null is treated as empty).
 * @throws the exception built by makeStringException if the file cannot be
 *         created, or if fewer bytes than requested were written.
 */
void saveWuidToFile(const char *wuid)
{
    // Guard against a null id: strlen(nullptr) is undefined behaviour
    const char *id = wuid ? wuid : "";
    const size_t idLen = strlen(id);

    Owned<IFile> wuidFile = createIFile("wuid"); // NB: each pod is in its own private working directory
    Owned<IFileIO> wuidFileIO = wuidFile->open(IFOcreate);
    if (!wuidFileIO)
        throw makeStringException(0, "Failed to create file 'wuid' to store current workunit for post mortem script");

    if (0 == idLen)
        return; // nothing to record

    // A partially written id would make the post mortem script target the wrong workunit,
    // so treat a short write as a failure rather than silently ignoring it.
    const auto written = wuidFileIO->write(0, idLen, id);
    if (written != idLen)
        throw makeStringException(0, "Failed to write current workunit to file 'wuid' for post mortem script");
}
2 changes: 2 additions & 0 deletions thorlcr/thorutil/thormisc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -716,5 +716,7 @@ class graph_decl CThorPerfTracer : protected PerfTracer
void stop();
};

// Writes the given workunit id to a local file named 'wuid' (relative to the current working directory),
// so that the post mortem script can find the current workunit.
// Throws an exception if the file cannot be created.
extern graph_decl void saveWuidToFile(const char *wuid);

#endif

0 comments on commit 479ca3e

Please sign in to comment.