From c638ce5d53ad252d7f4ebe38554ec43d98a464a1 Mon Sep 17 00:00:00 2001 From: Shamser Ahmed Date: Fri, 5 Apr 2024 10:20:51 +0100 Subject: [PATCH] HPCC-31569 Thor CostExecute calc may be incorrect under some circumstances Signed-off-by: Shamser Ahmed --- common/workunit/workunit.cpp | 7 +++++-- common/workunit/workunit.hpp | 2 +- thorlcr/master/thdemonserver.cpp | 4 +--- thorlcr/master/thgraphmanager.cpp | 3 +-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/common/workunit/workunit.cpp b/common/workunit/workunit.cpp index 593a9c8a43b..97c2b391f2d 100644 --- a/common/workunit/workunit.cpp +++ b/common/workunit/workunit.cpp @@ -13937,13 +13937,16 @@ extern WORKUNIT_API double getThorManagerRate() extern WORKUNIT_API double getThorWorkerRate() { + // Note: (bare-metal) the use of getAffinityCpus to get the number of CPUs used by workers + // doesn't really make sense since the caller is likely to be running on thor manager (so it will + // return cpu affinity for the manager, rather than for the worker). This needs rethinking. double numCpus = isContainerized() ? getResourcedCpus("workerResources") : getAffinityCpus(); return getCostCpuHour() * numCpus; } -extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberOfMachines) +extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberOfWorkers) { - return calcCost(getThorManagerRate(), ms) + calcCost(getThorWorkerRate(), ms) * numberOfMachines; + return calcCost(getThorManagerRate(), ms) + calcCost(getThorWorkerRate(), ms) * numberOfWorkers; } void aggregateStatistic(StatsAggregation & result, IConstWorkUnit * wu, const WuScopeFilter & filter, StatisticKind search) diff --git a/common/workunit/workunit.hpp b/common/workunit/workunit.hpp index 1ca8b625ab7..3dfdfd3ae8d 100644 --- a/common/workunit/workunit.hpp +++ b/common/workunit/workunit.hpp @@ -1739,7 +1739,7 @@ extern WORKUNIT_API void addTimeStamp(IWorkUnit * wu, unsigned wfid, const char extern WORKUNIT_API double getMachineCostRate(); extern WORKUNIT_API double getThorManagerRate(); extern WORKUNIT_API double getThorWorkerRate(); -extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberMachines); +extern WORKUNIT_API double calculateThorCost(unsigned __int64 ms, unsigned numberOfWorkers); extern WORKUNIT_API IPropertyTree * getWUGraphProgress(const char * wuid, bool readonly); diff --git a/thorlcr/master/thdemonserver.cpp b/thorlcr/master/thdemonserver.cpp index 5de19dced0b..b6f35ff5ec6 100644 --- a/thorlcr/master/thdemonserver.cpp +++ b/thorlcr/master/thdemonserver.cpp @@ -40,7 +40,6 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer UnsignedArray graphStarts; double thorManagerRate = 0; double thorWorkerRate = 0; - unsigned numberOfMachines = 0; cost_type costLimit = 0; cost_type workunitCost = 0; @@ -90,7 +89,7 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer updateWorkunitStat(wu, SSTsubgraph, graphScope, StTimeElapsed, timer, milliToNano(duration)); if (costLimit || finished) { - const cost_type sgCost = money2cost_type(calcCost(thorManagerRate, duration) + calcCost(thorWorkerRate, duration) * numberOfMachines); + const cost_type sgCost = money2cost_type(calcCost(thorManagerRate, duration) + calcCost(thorWorkerRate, duration) * queryNodeClusterWidth()); cost_type costDiskAccess = graph.getDiskAccessCost(); if (finished) { @@ -268,7 +267,6 @@ class DeMonServer : public CSimpleInterface, implements IDeMonServer costLimit = money2cost_type(hardLimit); else costLimit = money2cost_type(tmpcostLimit); - numberOfMachines = queryNodeClusterWidth() / globals->getPropInt("@numWorkersPerPod", 1); // Number of Pods or physical machines activeGraphs.append(*LINK(graph)); unsigned startTime = msTick(); graphStarts.append(startTime); diff --git a/thorlcr/master/thgraphmanager.cpp b/thorlcr/master/thgraphmanager.cpp index 8ac7ba3e021..4acbf5eebbd 100644 --- a/thorlcr/master/thgraphmanager.cpp +++ b/thorlcr/master/thgraphmanager.cpp @@ -1094,8 +1094,7 @@ bool CJobManager::executeGraph(IConstWorkUnit &workunit, const char *graphName, updateWorkunitStat(wu, SSTgraph, graphName, StTimeElapsed, graphTimeStr, graphTimeNs, wfid); addTimeStamp(wu, SSTgraph, graphName, StWhenFinished, wfid); - unsigned numberOfMachines = queryNodeClusterWidth() / globals->getPropInt("@numWorkersPerPod", 1); // Number of Pods or physical machines - cost_type cost = money2cost_type(calculateThorCost(nanoToMilli(graphTimeNs), numberOfMachines)); + cost_type cost = money2cost_type(calculateThorCost(nanoToMilli(graphTimeNs), queryNodeClusterWidth())); if (cost) wu->setStatistic(queryStatisticsComponentType(), queryStatisticsComponentName(), SSTgraph, graphScope, StCostExecute, NULL, cost, 1, 0, StatsMergeReplace); offset_t totalSpillSize = job->getTotalSpillSize();