Update winml to use #cores - #soc cores by Default as the number of i…

…ntraopthreads (#18384) Update winml to use #cores - #soc cores by Default as the number of intraopthreads --------- Co-authored-by: Sheil Kumar <[email protected]>
microsoft · Nov 28, 2023 · 0b7048e · 0b7048e
1 parent a6d8726
commit 0b7048e
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 10 deletions.
diff --git a/cmake/winml.cmake b/cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
   ${winml_lib_api_dir}/impl/TensorKindFrom.h
   ${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
   ${winml_lib_api_dir}/NumericData.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.h
   ${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
   ${winml_lib_api_dir}/ImageFeatureDescriptor.h
   ${winml_lib_api_dir}/ImageFeatureValue.cpp

diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "lib/Api/pch/pch.h"
+
+#include "HardwareCoreEnumerator.h"
+
+namespace WINMLP {
+
+struct LogicalProcessorInformation {
+  std::unique_ptr<char[]> Buffer;
+  size_t Length;
+};
+
+struct CoreCounter {
+  uint32_t PhysicalCores = 0;
+  uint32_t SocDieCores = 0;
+};
+
+static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
+  DWORD length = 0;
+  DWORD rc = GetLogicalProcessorInformationEx(relationship, nullptr, &length);
+
+  assert(rc == FALSE);
+
+  auto processorInformationBytes = std::make_unique<char[]>(length);
+
+  rc = GetLogicalProcessorInformationEx(
+    relationship, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()), &length
+  );
+
+  assert(rc == TRUE);
+
+  return {std::move(processorInformationBytes), length};
+}
+
+uint32_t CountSetBits(DWORD input) {
+  uint32_t c;
+  for (c = 0; input; c++) {
+    input &= input - 1;
+  }
+  return c;
+}
+
+static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
+  auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
+
+  CoreCounter cores;
+  DWORD dwLevel2GroupMask = 0;
+  DWORD dwLevel3GroupMask = 0;
+  size_t read = 0;
+  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX currentProcessorInfo = NULL;
+
+  while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) < logicalProcessorInformation.Length
+  ) {
+    currentProcessorInfo =
+      reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(logicalProcessorInformation.Buffer.get() + read);
+    if ((read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
+      break;
+    }
+
+    switch (currentProcessorInfo->Relationship) {
+      case RelationProcessorCore:
+        cores.PhysicalCores++;
+        break;
+      case RelationCache:
+        if (currentProcessorInfo->Cache.Level == 2) {
+          dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        } else if (currentProcessorInfo->Cache.Level == 3) {
+          dwLevel3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        }
+        break;
+    }
+
+    read += currentProcessorInfo->Size;
+  }
+
+  cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
+  return cores;
+}
+
+uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
+  // # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
+  // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
+  auto cores = GetNumberOPhysicalAndEngineeringCores();
+  // We want to use the number of physical cores, but exclude soc cores
+  return cores.PhysicalCores - cores.SocDieCores;
+}
+
+}  // namespace WINMLP
diff --git a/winml/lib/Api/HardwareCoreEnumerator.h b/winml/lib/Api/HardwareCoreEnumerator.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace WINMLP {
+struct HardwareCoreEnumerator {
+  HardwareCoreEnumerator() = delete;
+  static uint32_t DefaultIntraOpNumThreads();
+};
+}  // namespace WINMLP
diff --git a/winml/lib/Api/LearningModelDevice.cpp b/winml/lib/Api/LearningModelDevice.cpp
@@ -7,6 +7,7 @@
 #include <D3d11_4.h>
 #include <d3d11on12.h>
 #include "D3DDeviceCache.h"
+#include "HardwareCoreEnumerator.h"
 
 #include "ConverterResourceStore.h"
 
@@ -131,7 +132,7 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) {
 
 uint32_t LearningModelDevice::NumberOfIntraOpThreads() {
   if (IsCpuDevice()) {
-    return std::thread::hardware_concurrency();
+    return HardwareCoreEnumerator::DefaultIntraOpNumThreads();
   } else {
     // GPU sessions should not rely on intra op threads.
     // Creating a large thread pool is unnecessary and wasteful, and can cause

diff --git a/winml/lib/Api/LearningModelSessionOptions.cpp b/winml/lib/Api/LearningModelSessionOptions.cpp
@@ -3,11 +3,20 @@
 
 #include "lib/Api/pch/pch.h"
 #include "LearningModelSessionOptions.h"
+#include "HardwareCoreEnumerator.h"
 
 namespace WINMLP {
+
+LearningModelSessionOptions::LearningModelSessionOptions() {
+  intra_op_num_threads_override_ = HardwareCoreEnumerator::DefaultIntraOpNumThreads();
+}
+
 LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options)
   : batch_size_override_(options.batch_size_override_),
-    close_model_on_session_creation_(options.close_model_on_session_creation_) {
+    close_model_on_session_creation_(options.close_model_on_session_creation_),
+    named_dim_overrides_(options.named_dim_overrides_),
+    intra_op_num_threads_override_(options.intra_op_num_threads_override_),
+    custom_ops_lib_paths_(options.custom_ops_lib_paths_) {
 }
 
 uint32_t LearningModelSessionOptions::BatchSizeOverride() {

diff --git a/winml/lib/Api/LearningModelSessionOptions.h b/winml/lib/Api/LearningModelSessionOptions.h
@@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
                                        LearningModelSessionOptions,
                                        ILearningModelSessionOptionsNative,
                                        ILearningModelSessionOptionsNative1> {
-  LearningModelSessionOptions() = default;
+  LearningModelSessionOptions();
 
   LearningModelSessionOptions(const LearningModelSessionOptions& options);
 
@@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
   // The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations.
   // The default value here is the maximum number of logical cores to ensure that the default behavior of WinML always runs the fastest.
   // WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool
-  uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency();
+  uint32_t intra_op_num_threads_override_;
 
   bool allow_thread_spinning_ = true;
 

diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp
@@ -2195,12 +2195,6 @@ static void SetIntraOpNumThreads() {
   auto binding = LearningModelBinding(session);
   binding.Bind(L"input", tensor_input);
   WINML_EXPECT_NO_THROW(session.Evaluate(binding, L""));
-
-  // Check to verify that the default number of threads in LearningModelSession is equal to the number of logical cores.
-  session = LearningModelSession(model, device);
-  nativeSession = session.as<ILearningModelSessionNative>();
-  WINML_EXPECT_NO_THROW(nativeSession->GetIntraOpNumThreads(&numIntraOpThreads));
-  WINML_EXPECT_EQUAL(std::thread::hardware_concurrency(), numIntraOpThreads);
 }
 
 static void SetIntraOpThreadSpinning() {