Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update winml to use #cores - #soc cores by Default as the number of intraopthreads #18384

Merged
merged 10 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmake/winml.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.h
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
Expand Down
90 changes: 90 additions & 0 deletions winml/lib/Api/HardwareCoreEnumerator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) Microsoft Corporation. All rights reserved.

Check warning on line 1 in winml/lib/Api/HardwareCoreEnumerator.cpp

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] winml/lib/Api/HardwareCoreEnumerator.cpp#L1

At least two spaces is best between code and comments [whitespace/comments] [2]
Raw output
winml/lib/Api/HardwareCoreEnumerator.cpp:1:  At least two spaces is best between code and comments  [whitespace/comments] [2]
Fixed Show fixed Hide fixed
// Licensed under the MIT License.

#include "lib/Api/pch/pch.h"

#include "HardwareCoreEnumerator.h"

namespace WINMLP {

// Owns the raw byte buffer filled by GetLogicalProcessorInformationEx together with the number of valid bytes
// in it. A default-constructed value is an empty result: null buffer and zero length.
struct LogicalProcessorInformation {
  // Variable-sized SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX records, stored back to back.
  std::unique_ptr<char[]> Buffer;
  // Number of valid bytes in Buffer. Defaults to 0 so an empty result never advertises unread data.
  size_t Length = 0;
};

// Tallies produced while walking the processor topology records. Both counters start at zero.
struct CoreCounter {
  // Incremented once per RelationProcessorCore record.
  uint32_t PhysicalCores{0};
  // Number of bits set in (level-2 cache mask & ~level-3 cache mask).
  uint32_t SocDieCores{0};
};

// Queries the OS for the processor topology records matching `relationship`.
//
// Returns the byte buffer filled by GetLogicalProcessorInformationEx and the number of valid bytes in it.
// If either call does not behave as expected, an empty result (null buffer, Length == 0) is returned so that
// callers never walk past the end of the allocation.
static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
  // Size query: with no buffer the call is expected to fail and report the required size in `length`.
  DWORD length = 0;
  const bool sizeQueryFailed = (GetLogicalProcessorInformationEx(relationship, nullptr, &length) == FALSE);
  assert(sizeQueryFailed);
  if (!sizeQueryFailed || length == 0) {
    return {nullptr, 0};
  }

  auto processorInformationBytes = std::make_unique<char[]>(length);

  // The API writes variable-sized records into an untyped byte buffer, so the pointer has to be reinterpreted;
  // a static_cast between char* and the record type is not a valid conversion.
  const bool filled = (GetLogicalProcessorInformationEx(
                         relationship,
                         reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()),
                         &length) != FALSE);
  assert(filled);
  if (!filled) {
    // On failure `length` may no longer describe the allocated buffer, so do not hand it to the caller.
    return {nullptr, 0};
  }

  return {std::move(processorInformationBytes), length};
}

// Returns how many bits are set to 1 in `input`.
uint32_t CountSetBits(DWORD input) {
  uint32_t setBits = 0;
  while (input != 0) {
    // `input & (input - 1)` clears the lowest set bit, so the loop runs once per set bit.
    input &= input - 1;
    ++setBits;
  }
  return setBits;
}

// Walks the processor topology reported by the OS and tallies the cores.
//
// PhysicalCores is the number of RelationProcessorCore records. SocDieCores is the number of processors that
// appear in some level-2 cache affinity mask but in no level-3 cache affinity mask.
// NOTE(review): masks are OR-ed together without looking at GroupMask.Group, so processors from different
// processor groups share bit positions - presumably only exact on single-group systems; confirm.
static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
  auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);

  CoreCounter cores;
  // Affinity masks are pointer-sized; accumulating them in 64 bits keeps processors 32..63 that a DWORD drops.
  uint64_t level2Mask = 0;
  uint64_t level3Mask = 0;
  size_t read = 0;

  // Only touch a record once its fixed-size header is known to lie inside the buffer.
  while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) <
         logicalProcessorInformation.Length) {
    // Records are variable-sized and packed back to back in a byte buffer, hence the reinterpret_cast.
    auto* currentProcessorInfo = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(
      logicalProcessorInformation.Buffer.get() + read);

    // Stop on a record that claims to extend past the buffer, or that would not advance the cursor.
    if (currentProcessorInfo->Size == 0 ||
        (read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
      break;
    }

    switch (currentProcessorInfo->Relationship) {
      case RelationProcessorCore:
        cores.PhysicalCores++;
        break;
      case RelationCache:
        if (currentProcessorInfo->Cache.Level == 2) {
          level2Mask |= currentProcessorInfo->Cache.GroupMask.Mask;
        } else if (currentProcessorInfo->Cache.Level == 3) {
          level3Mask |= currentProcessorInfo->Cache.GroupMask.Mask;
        }
        break;
      default:
        break;
    }

    read += currentProcessorInfo->Size;
  }

  // CountSetBits takes a DWORD, so count the low and high halves of the 64-bit mask separately.
  const uint64_t socMask = level2Mask & ~level3Mask;
  cores.SocDieCores = CountSetBits(static_cast<DWORD>(socMask & 0xFFFFFFFFu)) +
                      CountSetBits(static_cast<DWORD>(socMask >> 32));
  return cores;
}

// Returns the default number of intra-op threads: the number of physical cores minus the SoC cores.
// The result is always at least 1.
uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
  // # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
  // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
  const auto cores = GetNumberOPhysicalAndEngineeringCores();

  // SocDieCores is derived from cache affinity masks, so it can equal or exceed PhysicalCores (for example
  // when no level-3 cache is reported, every processor with a level-2 cache is counted). Subtracting then
  // would give 0 or wrap around to a huge unsigned value, so fall back to the plain physical core count.
  if (cores.SocDieCores >= cores.PhysicalCores) {
    return cores.PhysicalCores > 0 ? cores.PhysicalCores : 1;
  }

  // We want to use the number of physical cores, but exclude soc cores
  return cores.PhysicalCores - cores.SocDieCores;
}

} // namespace WINMLP
11 changes: 11 additions & 0 deletions winml/lib/Api/HardwareCoreEnumerator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.

Check warning on line 1 in winml/lib/Api/HardwareCoreEnumerator.h

View workflow job for this annotation

GitHub Actions / cpplint

[cpplint] winml/lib/Api/HardwareCoreEnumerator.h#L1

At least two spaces is best between code and comments [whitespace/comments] [2]
Raw output
winml/lib/Api/HardwareCoreEnumerator.h:1:  At least two spaces is best between code and comments  [whitespace/comments] [2]
// Licensed under the MIT License.

#pragma once

namespace WINMLP {
// Static-only helper that derives thread-count defaults from the processor topology.
struct HardwareCoreEnumerator {
// Not instantiable: the type only exposes static functions.
HardwareCoreEnumerator() = delete;
// Returns the default number of intra-op threads, computed as the number of physical cores minus the
// number of SoC cores.
static uint32_t DefaultIntraOpNumThreads();
};
} // namespace WINMLP
3 changes: 2 additions & 1 deletion winml/lib/Api/LearningModelDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <D3d11_4.h>
#include <d3d11on12.h>
#include "D3DDeviceCache.h"
#include "HardwareCoreEnumerator.h"

#include "ConverterResourceStore.h"

Expand Down Expand Up @@ -131,7 +132,7 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) {

uint32_t LearningModelDevice::NumberOfIntraOpThreads() {
if (IsCpuDevice()) {
return std::thread::hardware_concurrency();
return HardwareCoreEnumerator::DefaultIntraOpNumThreads();
} else {
// GPU sessions should not rely on intra op threads.
// Creating a large thread pool is unnecessary and wasteful, and can cause
Expand Down
11 changes: 10 additions & 1 deletion winml/lib/Api/LearningModelSessionOptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,20 @@

#include "lib/Api/pch/pch.h"
#include "LearningModelSessionOptions.h"
#include "HardwareCoreEnumerator.h"

namespace WINMLP {

// Default constructor: seeds the intra-op thread count override with the hardware-derived default.
LearningModelSessionOptions::LearningModelSessionOptions()
  : intra_op_num_threads_override_(HardwareCoreEnumerator::DefaultIntraOpNumThreads()) {
}

// Copy constructor: copies the batch size override, the close-model-on-session-creation flag, the named
// dimension overrides, the intra-op thread count override and the custom op library paths from `options`.
// NOTE(review): allow_thread_spinning_ is not in this initializer list, so a copy gets the member's
// in-class default rather than the source object's value - confirm whether that is intended.
LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options)
: batch_size_override_(options.batch_size_override_),
close_model_on_session_creation_(options.close_model_on_session_creation_) {
close_model_on_session_creation_(options.close_model_on_session_creation_),
named_dim_overrides_(options.named_dim_overrides_),
intra_op_num_threads_override_(options.intra_op_num_threads_override_),
custom_ops_lib_paths_(options.custom_ops_lib_paths_) {
}

uint32_t LearningModelSessionOptions::BatchSizeOverride() {
Expand Down
4 changes: 2 additions & 2 deletions winml/lib/Api/LearningModelSessionOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
LearningModelSessionOptions,
ILearningModelSessionOptionsNative,
ILearningModelSessionOptionsNative1> {
LearningModelSessionOptions() = default;
LearningModelSessionOptions();

LearningModelSessionOptions(const LearningModelSessionOptions& options);

Expand Down Expand Up @@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
// The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations.
// The default value is assigned in the constructor from HardwareCoreEnumerator::DefaultIntraOpNumThreads(), i.e. the number of physical cores excluding SoC cores.
// WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool
uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency();
uint32_t intra_op_num_threads_override_;

bool allow_thread_spinning_ = true;

Expand Down
6 changes: 0 additions & 6 deletions winml/test/api/LearningModelSessionAPITest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2195,12 +2195,6 @@ static void SetIntraOpNumThreads() {
auto binding = LearningModelBinding(session);
binding.Bind(L"input", tensor_input);
WINML_EXPECT_NO_THROW(session.Evaluate(binding, L""));

// Check to verify that the default number of threads in LearningModelSession is equal to the number of logical cores.
session = LearningModelSession(model, device);
nativeSession = session.as<ILearningModelSessionNative>();
WINML_EXPECT_NO_THROW(nativeSession->GetIntraOpNumThreads(&numIntraOpThreads));
WINML_EXPECT_EQUAL(std::thread::hardware_concurrency(), numIntraOpThreads);
}

static void SetIntraOpThreadSpinning() {
Expand Down
Loading