Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HPCC-31569 Thor CostExecute calc will be incorrect if workersPerPod > 1 #18497

Merged
merged 1 commit into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions common/workunit/workunit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13937,13 +13937,16 @@ extern WORKUNIT_API double getThorManagerRate()

extern WORKUNIT_API double getThorWorkerRate()
{
// Note: (bare-metal) the use of getAffinityCpus to get the number of CPUs used by workers
// doesn't really make sense since the caller is likely to be running on thor manager (so it will
// return cpu affinity for the manager, rather than for the worker). This needs rethinking.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed. Is there a JIRA to track this? (can you link it to HPCC-31569)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done. I've created https://hpccsystems.atlassian.net/browse/HPCC-31572 and linked to this issue.

double numCpus = isContainerized() ? getResourcedCpus("workerResources") : getAffinityCpus();
return getCostCpuHour() * numCpus;
}

extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberOfMachines)
extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberOfWorkers)
{
return calcCost(getThorManagerRate(), ms) + calcCost(getThorWorkerRate(), ms) * numberOfMachines;
return calcCost(getThorManagerRate(), ms) + calcCost(getThorWorkerRate(), ms) * numberOfWorkers;
}

void aggregateStatistic(StatsAggregation & result, IConstWorkUnit * wu, const WuScopeFilter & filter, StatisticKind search)
Expand Down
2 changes: 1 addition & 1 deletion common/workunit/workunit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1739,7 +1739,7 @@ extern WORKUNIT_API void addTimeStamp(IWorkUnit * wu, unsigned wfid, const char
extern WORKUNIT_API double getMachineCostRate();
extern WORKUNIT_API double getThorManagerRate();
extern WORKUNIT_API double getThorWorkerRate();
extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberMachines);
extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberOfWorkers);

extern WORKUNIT_API IPropertyTree * getWUGraphProgress(const char * wuid, bool readonly);

Expand Down
4 changes: 1 addition & 3 deletions thorlcr/master/thdemonserver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer
UnsignedArray graphStarts;
double thorManagerRate = 0;
double thorWorkerRate = 0;
unsigned numberOfMachines = 0;
cost_type costLimit = 0;
cost_type workunitCost = 0;

Expand Down Expand Up @@ -90,7 +89,7 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer
updateWorkunitStat(wu, SSTsubgraph, graphScope, StTimeElapsed, timer, milliToNano(duration));
if (costLimit || finished)
{
const cost_type sgCost = money2cost_type(calcCost(thorManagerRate, duration) + calcCost(thorWorkerRate, duration) * numberOfMachines);
const cost_type sgCost = money2cost_type(calcCost(thorManagerRate, duration) + calcCost(thorWorkerRate, duration) * queryNodeClusterWidth());
cost_type costDiskAccess = graph.getDiskAccessCost();
if (finished)
{
Expand Down Expand Up @@ -268,7 +267,6 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer
costLimit = money2cost_type(hardLimit);
else
costLimit = money2cost_type(tmpcostLimit);
numberOfMachines = queryNodeClusterWidth() / globals->getPropInt("@numWorkersPerPod", 1); // Number of Pods or physical machines
activeGraphs.append(*LINK(graph));
unsigned startTime = msTick();
graphStarts.append(startTime);
Expand Down
3 changes: 1 addition & 2 deletions thorlcr/master/thgraphmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1094,8 +1094,7 @@ bool CJobManager::executeGraph(IConstWorkUnit &workunit, const char *graphName,
updateWorkunitStat(wu, SSTgraph, graphName, StTimeElapsed, graphTimeStr, graphTimeNs, wfid);

addTimeStamp(wu, SSTgraph, graphName, StWhenFinished, wfid);
unsigned numberOfMachines = queryNodeClusterWidth() / globals->getPropInt("@numWorkersPerPod", 1); // Number of Pods or physical machines
cost_type cost = money2cost_type(calculateThorCost(nanoToMilli(graphTimeNs), numberOfMachines));
cost_type cost = money2cost_type(calculateThorCost(nanoToMilli(graphTimeNs), queryNodeClusterWidth()));
if (cost)
wu->setStatistic(queryStatisticsComponentType(), queryStatisticsComponentName(), SSTgraph, graphScope, StCostExecute, NULL, cost, 1, 0, StatsMergeReplace);
offset_t totalSpillSize = job->getTotalSpillSize();
Expand Down
Loading