Skip to content

Commit

Permalink
Update winml to use #cores - #soc cores by Default as the number of i…
Browse files Browse the repository at this point in the history
…ntraopthreads (#18384)

Update winml to use #cores - #soc cores by Default as the number of
intraopthreads

---------

Co-authored-by: Sheil Kumar <[email protected]>
  • Loading branch information
smk2007 and Sheil Kumar authored Nov 28, 2023
1 parent a6d8726 commit 0b7048e
Show file tree
Hide file tree
Showing 7 changed files with 117 additions and 10 deletions.
2 changes: 2 additions & 0 deletions cmake/winml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.h
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
Expand Down
90 changes: 90 additions & 0 deletions winml/lib/Api/HardwareCoreEnumerator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "lib/Api/pch/pch.h"

#include "HardwareCoreEnumerator.h"

namespace WINMLP {

// Owning byte buffer holding the variable-length records written by
// GetLogicalProcessorInformationEx, together with the number of valid bytes in it.
struct LogicalProcessorInformation {
  // Raw storage for the SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX records; null when empty.
  std::unique_ptr<char[]> Buffer;
  // Number of valid bytes in Buffer. Defaults to 0 so that a default-constructed
  // instance describes an empty buffer instead of carrying an indeterminate length.
  size_t Length = 0;
};

// Tallies produced by one pass over the logical processor information records.
struct CoreCounter {
  // Number of RelationProcessorCore records seen.
  uint32_t PhysicalCores{0};
  // Number of bits set in the accumulated L2 cache mask that are absent from the
  // accumulated L3 cache mask.
  uint32_t SocDieCores{0};
};

// Queries GetLogicalProcessorInformationEx for the given relationship and returns the
// raw record buffer together with its length in bytes.
//
// The API is called twice: first with a null buffer to learn the required size, then
// with a buffer of that size to fetch the records.
//
// Returns an empty result (null Buffer, Length == 0) if either call does not behave as
// expected. The asserts still flag that in debug builds, but release builds no longer
// hand back a zero-filled buffer with a non-zero length for callers to parse.
static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
  DWORD length = 0;

  // Size probe: expected to fail and report the required byte count through `length`.
  const auto probeResult = GetLogicalProcessorInformationEx(relationship, nullptr, &length);
  assert(probeResult == FALSE);
  if (probeResult != FALSE || length == 0) {
    return {nullptr, 0};
  }

  auto processorInformationBytes = std::make_unique<char[]>(length);

  const auto queryResult = GetLogicalProcessorInformationEx(
    relationship, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()), &length
  );
  assert(queryResult != FALSE);
  if (queryResult == FALSE) {
    return {nullptr, 0};
  }

  return {std::move(processorInformationBytes), length};
}

// Returns how many bits of `input` are set to 1.
uint32_t CountSetBits(DWORD input) {
  uint32_t setBits = 0;
  while (input != 0) {
    // Clearing the lowest set bit each round makes the loop run once per set bit.
    input &= input - 1;
    ++setBits;
  }
  return setBits;
}

// Walks every RelationAll record and returns:
//   - PhysicalCores: the number of RelationProcessorCore records, and
//   - SocDieCores:   the number of logical processors that appear in some level-2 cache
//                    mask but in no level-3 cache mask.
//
// NOTE(review): only GroupMask.Mask is accumulated; the processor group number is not
// consulted, so masks from different processor groups would be OR-ed together. Confirm
// whether multi-group systems need separate handling.
static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
  const auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);

  CoreCounter cores;
  // The affinity mask is pointer-sized (64 bits on 64-bit builds). Accumulate in 64 bits
  // so logical processors 32..63 are not silently dropped.
  uint64_t level2GroupMask = 0;
  uint64_t level3GroupMask = 0;
  size_t read = 0;

  // Records are variable length; each one states its own Size. Continue only while at
  // least the fixed header of the next record fits in the buffer.
  while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) <
         logicalProcessorInformation.Length) {
    const auto* currentProcessorInfo = reinterpret_cast<const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(
      logicalProcessorInformation.Buffer.get() + read
    );

    // Stop on a record that claims to extend past the buffer, and on a zero-sized record,
    // which would otherwise never advance `read` and spin forever.
    if (currentProcessorInfo->Size == 0 ||
        (read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
      break;
    }

    switch (currentProcessorInfo->Relationship) {
      case RelationProcessorCore:
        cores.PhysicalCores++;
        break;
      case RelationCache:
        if (currentProcessorInfo->Cache.Level == 2) {
          level2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
        } else if (currentProcessorInfo->Cache.Level == 3) {
          level3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
        }
        break;
      default:
        // Other relationship kinds do not contribute to either count.
        break;
    }

    read += currentProcessorInfo->Size;
  }

  // CountSetBits operates on 32 bits, so count the two halves of the mask separately.
  const uint64_t socDieMask = level2GroupMask & ~level3GroupMask;
  cores.SocDieCores = CountSetBits(static_cast<DWORD>(socDieMask & 0xFFFFFFFFull)) +
                      CountSetBits(static_cast<DWORD>(socDieMask >> 32));
  return cores;
}

// Returns the default intra-op thread count: physical cores minus SoC-die cores.
//
// # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
// # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
//
// The result is always at least 1.
uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
  const auto cores = GetNumberOPhysicalAndEngineeringCores();

  // We want to use the number of physical cores, but exclude soc cores.
  if (cores.SocDieCores < cores.PhysicalCores) {
    return cores.PhysicalCores - cores.SocDieCores;
  }

  // SocDieCores is derived from logical-processor bits while PhysicalCores counts core
  // records, so it can reach or exceed PhysicalCores. Subtracting then would yield 0 or
  // wrap around to a huge unsigned value; use the physical core count alone instead.
  if (cores.PhysicalCores > 0) {
    return cores.PhysicalCores;
  }

  // No core records were found at all; fall back to a single thread.
  return 1;
}

} // namespace WINMLP
11 changes: 11 additions & 0 deletions winml/lib/Api/HardwareCoreEnumerator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace WINMLP {
// Static-only helper exposing hardware-derived threading defaults.
struct HardwareCoreEnumerator {
  // Default number of intra-op threads for this machine.
  static uint32_t DefaultIntraOpNumThreads();

  // Not instantiable: the type only groups static functions.
  HardwareCoreEnumerator() = delete;
};
} // namespace WINMLP
3 changes: 2 additions & 1 deletion winml/lib/Api/LearningModelDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <D3d11_4.h>
#include <d3d11on12.h>
#include "D3DDeviceCache.h"
#include "HardwareCoreEnumerator.h"

#include "ConverterResourceStore.h"

Expand Down Expand Up @@ -131,7 +132,7 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) {

uint32_t LearningModelDevice::NumberOfIntraOpThreads() {
if (IsCpuDevice()) {
return std::thread::hardware_concurrency();
return HardwareCoreEnumerator::DefaultIntraOpNumThreads();
} else {
// GPU sessions should not rely on intra op threads.
// Creating a large thread pool is unnecessary and wasteful, and can cause
Expand Down
11 changes: 10 additions & 1 deletion winml/lib/Api/LearningModelSessionOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,20 @@

#include "lib/Api/pch/pch.h"
#include "LearningModelSessionOptions.h"
#include "HardwareCoreEnumerator.h"

namespace WINMLP {

// Default construction seeds the intra-op thread count override with the
// hardware-derived default from HardwareCoreEnumerator.
LearningModelSessionOptions::LearningModelSessionOptions()
    : intra_op_num_threads_override_(HardwareCoreEnumerator::DefaultIntraOpNumThreads()) {
}

LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options)
: batch_size_override_(options.batch_size_override_),
close_model_on_session_creation_(options.close_model_on_session_creation_) {
close_model_on_session_creation_(options.close_model_on_session_creation_),
named_dim_overrides_(options.named_dim_overrides_),
intra_op_num_threads_override_(options.intra_op_num_threads_override_),
custom_ops_lib_paths_(options.custom_ops_lib_paths_) {
}

uint32_t LearningModelSessionOptions::BatchSizeOverride() {
Expand Down
4 changes: 2 additions & 2 deletions winml/lib/Api/LearningModelSessionOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
LearningModelSessionOptions,
ILearningModelSessionOptionsNative,
ILearningModelSessionOptionsNative1> {
LearningModelSessionOptions() = default;
LearningModelSessionOptions();

LearningModelSessionOptions(const LearningModelSessionOptions& options);

Expand Down Expand Up @@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
// The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations.
// The default value is assigned in the constructor from HardwareCoreEnumerator::DefaultIntraOpNumThreads(), i.e. the number of physical cores minus SoC-die cores, rather than the number of logical cores.
// WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool
uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency();
uint32_t intra_op_num_threads_override_;

bool allow_thread_spinning_ = true;

Expand Down
6 changes: 0 additions & 6 deletions winml/test/api/LearningModelSessionAPITest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2195,12 +2195,6 @@ static void SetIntraOpNumThreads() {
auto binding = LearningModelBinding(session);
binding.Bind(L"input", tensor_input);
WINML_EXPECT_NO_THROW(session.Evaluate(binding, L""));

// Check to verify that the default number of threads in LearningModelSession is equal to the number of logical cores.
session = LearningModelSession(model, device);
nativeSession = session.as<ILearningModelSessionNative>();
WINML_EXPECT_NO_THROW(nativeSession->GetIntraOpNumThreads(&numIntraOpThreads));
WINML_EXPECT_EQUAL(std::thread::hardware_concurrency(), numIntraOpThreads);
}

static void SetIntraOpThreadSpinning() {
Expand Down

0 comments on commit 0b7048e

Please sign in to comment.