diff --git a/cmake/winml.cmake b/cmake/winml.cmake
index 395996f0fa4b9..268ee3960e75a 100644
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
   ${winml_lib_api_dir}/impl/TensorKindFrom.h
   ${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
   ${winml_lib_api_dir}/NumericData.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.h
   ${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
   ${winml_lib_api_dir}/ImageFeatureDescriptor.h
   ${winml_lib_api_dir}/ImageFeatureValue.cpp
diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp
new file mode 100644
index 0000000000000..a89ac561f8860
--- /dev/null
+++ b/winml/lib/Api/HardwareCoreEnumerator.cpp
@@ -0,0 +1,124 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "lib/Api/pch/pch.h"
+
+#include "HardwareCoreEnumerator.h"
+
+#include <memory>
+#include <thread>
+
+namespace WINMLP {
+
+// Raw GetLogicalProcessorInformationEx output; Buffer is null and Length is 0 when the query failed.
+struct LogicalProcessorInformation {
+  std::unique_ptr<char[]> Buffer;
+  size_t Length = 0;
+};
+
+// Tallies gathered while walking the processor information records.
+struct CoreCounter {
+  // Number of RelationProcessorCore records.
+  uint32_t PhysicalCores = 0;
+  // Number of logical processors that are covered by an L2 cache but by no L3 cache.
+  uint32_t SocDieCores = 0;
+};
+
+// Queries the OS for every processor information record of the given relationship.
+// Failures are handled at run time rather than with assert (which compiles away in release
+// builds): an empty result is returned instead of a buffer the OS never filled in.
+static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
+  // Size probe: this call is expected to fail and report the required byte count in |length|.
+  DWORD length = 0;
+  if (GetLogicalProcessorInformationEx(relationship, nullptr, &length) || length == 0) {
+    return {};
+  }
+
+  auto processorInformationBytes = std::make_unique<char[]>(length);
+  const BOOL succeeded = GetLogicalProcessorInformationEx(
+    relationship, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()), &length
+  );
+  if (!succeeded) {
+    return {};
+  }
+
+  return {std::move(processorInformationBytes), length};
+}
+
+// Returns the number of bits set in |input|.
+static uint32_t CountSetBits(KAFFINITY input) {
+  uint32_t count = 0;
+  // Each iteration clears the lowest set bit.
+  for (; input != 0; ++count) {
+    input &= input - 1;
+  }
+  return count;
+}
+
+// Walks every RelationAll record, counting physical cores and the logical processors that
+// have an L2 cache but no L3 cache.
+static CoreCounter GetNumberOfPhysicalAndSocDieCores() {
+  const auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
+
+  CoreCounter cores;
+  // KAFFINITY is pointer-sized; a DWORD accumulator would drop logical processors 32-63.
+  // NOTE(review): masks from different processor groups are merged here, as before - confirm
+  // whether machines with more than one group need per-group masks.
+  KAFFINITY level2GroupMask = 0;
+  KAFFINITY level3GroupMask = 0;
+  size_t read = 0;
+
+  while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) < logicalProcessorInformation.Length
+  ) {
+    const auto currentProcessorInfo =
+      reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(logicalProcessorInformation.Buffer.get() + read);
+    // A zero-sized record would never advance |read|; an oversized one runs past the buffer.
+    if (currentProcessorInfo->Size == 0 ||
+        (read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
+      break;
+    }
+
+    switch (currentProcessorInfo->Relationship) {
+      case RelationProcessorCore:
+        cores.PhysicalCores++;
+        break;
+      case RelationCache:
+        if (currentProcessorInfo->Cache.Level == 2) {
+          level2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        } else if (currentProcessorInfo->Cache.Level == 3) {
+          level3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        }
+        break;
+      default:
+        break;
+    }
+
+    read += currentProcessorInfo->Size;
+  }
+
+  cores.SocDieCores = CountSetBits(level2GroupMask & ~level3GroupMask);
+  return cores;
+}
+
+uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
+  // # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
+  // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
+  const auto cores = GetNumberOfPhysicalAndSocDieCores();
+
+  // We want to use the number of physical cores, but exclude soc cores.
+  if (cores.PhysicalCores > cores.SocDieCores) {
+    return cores.PhysicalCores - cores.SocDieCores;
+  }
+
+  // SocDieCores counts logical processors, so it can reach or exceed PhysicalCores (for example
+  // when no L3 cache is reported). The unsigned subtraction would then yield 0 or wrap around.
+  if (cores.PhysicalCores > 0) {
+    return cores.PhysicalCores;
+  }
+
+  // The topology query produced nothing: use the logical core count, but never report zero threads.
+  const uint32_t logicalCores = std::thread::hardware_concurrency();
+  return logicalCores != 0 ? logicalCores : 1;
+}
+
+}  // namespace WINMLP
diff --git a/winml/lib/Api/HardwareCoreEnumerator.h b/winml/lib/Api/HardwareCoreEnumerator.h
new file mode 100644
index 0000000000000..6861ba7d46bcf
--- /dev/null
+++ b/winml/lib/Api/HardwareCoreEnumerator.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+ +#pragma once + +namespace WINMLP { +struct HardwareCoreEnumerator { + HardwareCoreEnumerator() = delete; + static uint32_t DefaultIntraOpNumThreads(); +}; +} // namespace WINMLP diff --git a/winml/lib/Api/LearningModelDevice.cpp b/winml/lib/Api/LearningModelDevice.cpp index c9c6f5bc70ee2..9f48ee03886e1 100644 --- a/winml/lib/Api/LearningModelDevice.cpp +++ b/winml/lib/Api/LearningModelDevice.cpp @@ -7,6 +7,7 @@ #include #include #include "D3DDeviceCache.h" +#include "HardwareCoreEnumerator.h" #include "ConverterResourceStore.h" @@ -131,7 +132,7 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) { uint32_t LearningModelDevice::NumberOfIntraOpThreads() { if (IsCpuDevice()) { - return std::thread::hardware_concurrency(); + return HardwareCoreEnumerator::DefaultIntraOpNumThreads(); } else { // GPU sessions should not rely on intra op threads. // Creating a large thread pool is unnecessary and wasteful, and can cause diff --git a/winml/lib/Api/LearningModelSessionOptions.cpp b/winml/lib/Api/LearningModelSessionOptions.cpp index 2ff9c6d1d56d0..374200fb3b9f8 100644 --- a/winml/lib/Api/LearningModelSessionOptions.cpp +++ b/winml/lib/Api/LearningModelSessionOptions.cpp @@ -3,11 +3,20 @@ #include "lib/Api/pch/pch.h" #include "LearningModelSessionOptions.h" +#include "HardwareCoreEnumerator.h" namespace WINMLP { + +LearningModelSessionOptions::LearningModelSessionOptions() { + intra_op_num_threads_override_ = HardwareCoreEnumerator::DefaultIntraOpNumThreads(); +} + LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options) : batch_size_override_(options.batch_size_override_), - close_model_on_session_creation_(options.close_model_on_session_creation_) { + close_model_on_session_creation_(options.close_model_on_session_creation_), + named_dim_overrides_(options.named_dim_overrides_), + intra_op_num_threads_override_(options.intra_op_num_threads_override_), + custom_ops_lib_paths_(options.custom_ops_lib_paths_) 
{ } uint32_t LearningModelSessionOptions::BatchSizeOverride() { diff --git a/winml/lib/Api/LearningModelSessionOptions.h b/winml/lib/Api/LearningModelSessionOptions.h index 5fc7e54997403..21d0242735f94 100644 --- a/winml/lib/Api/LearningModelSessionOptions.h +++ b/winml/lib/Api/LearningModelSessionOptions.h @@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT< LearningModelSessionOptions, ILearningModelSessionOptionsNative, ILearningModelSessionOptionsNative1> { - LearningModelSessionOptions() = default; + LearningModelSessionOptions(); LearningModelSessionOptions(const LearningModelSessionOptions& options); @@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT< // The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations. - // The default value here is the maximum number of logical cores to ensure that the default behavior of WinML always runs the fastest. + // The default value is assigned in the default constructor from HardwareCoreEnumerator::DefaultIntraOpNumThreads(): the number of physical cores, excluding SoC cores. // WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool - uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency(); + uint32_t intra_op_num_threads_override_; bool allow_thread_spinning_ = true; diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp index 4ec79b8a0f4c6..d6e70e35e3a6d 100644 --- a/winml/test/api/LearningModelSessionAPITest.cpp +++ b/winml/test/api/LearningModelSessionAPITest.cpp @@ -2195,12 +2195,6 @@ static void SetIntraOpNumThreads() { auto binding = LearningModelBinding(session); binding.Bind(L"input", tensor_input); WINML_EXPECT_NO_THROW(session.Evaluate(binding, L"")); - - // Check to verify that the default number of threads in LearningModelSession is equal to the number of logical cores. 
- session = LearningModelSession(model, device); - nativeSession = session.as(); - WINML_EXPECT_NO_THROW(nativeSession->GetIntraOpNumThreads(&numIntraOpThreads)); - WINML_EXPECT_EQUAL(std::thread::hardware_concurrency(), numIntraOpThreads); } static void SetIntraOpThreadSpinning() {