microsoft · smk2007 · Nov 28, 2023 · Nov 10, 2023 · Nov 10, 2023 · Nov 13, 2023
diff --git a/cmake/winml.cmake b/cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
   ${winml_lib_api_dir}/impl/TensorKindFrom.h
   ${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
   ${winml_lib_api_dir}/NumericData.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.h
   ${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
   ${winml_lib_api_dir}/ImageFeatureDescriptor.h
   ${winml_lib_api_dir}/ImageFeatureValue.cpp

diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "lib/Api/pch/pch.h"
+
+#include "HardwareCoreEnumerator.h"
+
+#include <thread>
+
+namespace WINMLP {
+
+// Owns the raw bytes written by GetLogicalProcessorInformationEx together with the number of
+// valid bytes. The bytes are a packed sequence of SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX records.
+struct LogicalProcessorInformation {
+  // Raw record storage; null for a default-constructed instance.
+  std::unique_ptr<char[]> Buffer;
+  // Number of valid bytes in Buffer. Zero-initialized so a default-constructed instance is well defined.
+  size_t Length = 0;
+};
+
+// Retrieves the processor topology records of the requested relationship type from the OS.
+//
+// GetLogicalProcessorInformationEx is called twice: first with a null buffer to learn the
+// required size in bytes, then again to fill a buffer of exactly that size.
+//
+// Returns the filled buffer and its length in bytes. When either call does not behave as
+// expected, an empty result (null Buffer, Length == 0) is returned. The checks are done at
+// runtime rather than with assert() so that failures are not silently ignored in release builds.
+static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
+  DWORD length = 0;
+
+  // The sizing call is expected to fail with ERROR_INSUFFICIENT_BUFFER and report the byte count.
+  const BOOL sizeQuerySucceeded = GetLogicalProcessorInformationEx(relationship, nullptr, &length);
+  if (sizeQuerySucceeded || GetLastError() != ERROR_INSUFFICIENT_BUFFER || length == 0) {
+    return {nullptr, 0};
+  }
+
+  auto processorInformationBytes = std::make_unique<char[]>(length);
+
+  const BOOL fetchSucceeded = GetLogicalProcessorInformationEx(
+    relationship,
+    reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()),
+    &length
+  );
+  if (!fetchSucceeded) {
+    return {nullptr, 0};
+  }
+
+  return {std::move(processorInformationBytes), length};
+}
+
+// Counts the logical processors that sit behind a level-2 cache but behind no level-3 cache.
+//
+// The affinity masks of all L2 cache records and of all L3 cache records are accumulated
+// separately; the result is the number of bits set in (L2 mask & ~L3 mask).
+//
+// NOTE: the masks are accumulated without regard to the processor group they belong to, so on
+// systems with more than one processor group bits from different groups share the same mask.
+static long long GetNumberOfSoCDieAtoms() {
+  const auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
+  const char* const buffer = logicalProcessorInformation.Buffer.get();
+  const size_t length = logicalProcessorInformation.Length;
+
+  // Relationship and Size precede the per-relationship union; they must be readable before a record can be trusted.
+  const size_t headerSize = static_cast<size_t>(FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor));
+
+  // KAFFINITY is pointer-sized; a DWORD would drop processors 32..63 on 64-bit builds.
+  KAFFINITY level2Mask = 0;
+  KAFFINITY level3Mask = 0;
+
+  size_t read = 0;
+  while (buffer != nullptr && length - read >= headerSize) {
+    const auto* record = reinterpret_cast<const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(buffer + read);
+
+    // Records are variable-length: stop on a malformed size instead of walking outside the buffer.
+    if (record->Size < headerSize || record->Size > length - read) {
+      break;
+    }
+
+    if (record->Relationship == RelationCache) {
+      if (record->Cache.Level == 2) {
+        level2Mask |= record->Cache.GroupMask.Mask;
+      } else if (record->Cache.Level == 3) {
+        level3Mask |= record->Cache.GroupMask.Mask;
+      }
+    }
+
+    // Advance by the size the OS reported for this record, not by sizeof(the struct).
+    read += record->Size;
+  }
+
+  // Portable bit count: avoids __popcnt/__popcnt64, which differ by target architecture.
+  long long count = 0;
+  for (KAFFINITY socMask = level2Mask & ~level3Mask; socMask != 0; socMask &= socMask - 1) {
+    ++count;
+  }
+  return count;
+}
+
+// Counts the logical processors reported by the RelationProcessorCore records.
+//
+// Every set bit in a core record's affinity mask is one logical processor, so a core that
+// exposes several hardware threads contributes several to the total. Bits are counted per
+// record and summed; on a single processor group this equals the bit count of the OR of all
+// masks, and it stays correct when records belong to different processor groups.
+static long long GetNumberOfCores() {
+  const auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationProcessorCore);
+  const char* const buffer = logicalProcessorInformation.Buffer.get();
+  const size_t length = logicalProcessorInformation.Length;
+
+  // Relationship and Size precede the per-relationship union; they must be readable before a record can be trusted.
+  const size_t headerSize = static_cast<size_t>(FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor));
+
+  long long logicalProcessorCount = 0;
+  size_t read = 0;
+  while (buffer != nullptr && length - read >= headerSize) {
+    const auto* record = reinterpret_cast<const SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(buffer + read);
+
+    // Records are variable-length: stop on a malformed size instead of walking outside the buffer.
+    if (record->Size < headerSize || record->Size > length - read) {
+      break;
+    }
+
+    if (record->Relationship == RelationProcessorCore) {
+      // Portable bit count: __popcnt64 is not available on 32-bit x86 builds.
+      for (KAFFINITY mask = record->Processor.GroupMask[0].Mask; mask != 0; mask &= mask - 1) {
+        ++logicalProcessorCount;
+      }
+    }
+
+    // Advance by the size the OS reported for this record, not by sizeof(the struct).
+    read += record->Size;
+  }
+  return logicalProcessorCount;
+}
+
+// Returns the default number of intra-op threads: the processor count from GetNumberOfCores()
+// minus the count from GetNumberOfSoCDieAtoms().
+//
+// The result is never zero and never wraps around:
+//   - if no processors were enumerated, std::thread::hardware_concurrency() is used (at least 1);
+//   - if the subtraction would leave nothing (e.g. no level-3 cache was reported at all, so every
+//     processor was counted as an SoC-die atom), the full processor count is returned.
+uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
+  const auto number_of_cores = static_cast<uint32_t>(GetNumberOfCores());
+  const auto number_of_soc_die_atoms = static_cast<uint32_t>(GetNumberOfSoCDieAtoms());
+
+  if (number_of_cores == 0) {
+    const uint32_t hardware_threads = std::thread::hardware_concurrency();
+    return hardware_threads != 0 ? hardware_threads : 1;
+  }
+
+  // Unsigned arithmetic: subtracting a larger or equal value would wrap or yield zero threads.
+  if (number_of_soc_die_atoms >= number_of_cores) {
+    return number_of_cores;
+  }
+
+  return number_of_cores - number_of_soc_die_atoms;
+}
+
+}  // namespace WINMLP
diff --git a/winml/lib/Api/HardwareCoreEnumerator.h b/winml/lib/Api/HardwareCoreEnumerator.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace WINMLP {
+// Holder for static hardware-enumeration helpers. It has no state and cannot be instantiated.
+struct HardwareCoreEnumerator {
+  // Returns the thread count WinML uses by default for intra-operator parallelism on CPU.
+  static uint32_t DefaultIntraOpNumThreads();
+
+  HardwareCoreEnumerator() = delete;
+};
+}  // namespace WINMLP
diff --git a/winml/lib/Api/LearningModelDevice.cpp b/winml/lib/Api/LearningModelDevice.cpp
@@ -7,6 +7,7 @@
 #include <D3d11_4.h>
 #include <d3d11on12.h>
 #include "D3DDeviceCache.h"
+#include "HardwareCoreEnumerator.h"
 
 #include "ConverterResourceStore.h"
 
@@ -129,9 +130,10 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) {
   return S_OK;
 }
 
+
 uint32_t LearningModelDevice::NumberOfIntraOpThreads() {
   if (IsCpuDevice()) {
-    return std::thread::hardware_concurrency();
+    return HardwareCoreEnumerator::DefaultIntraOpNumThreads();
   } else {
     // GPU sessions should not rely on intra op threads.
     // Creating a large thread pool is unnecessary and wasteful, and can cause

diff --git a/winml/lib/Api/LearningModelSessionOptions.cpp b/winml/lib/Api/LearningModelSessionOptions.cpp
@@ -3,11 +3,22 @@
 
 #include "lib/Api/pch/pch.h"
 #include "LearningModelSessionOptions.h"
+#include "HardwareCoreEnumerator.h"
 
 namespace WINMLP {
+
+// Default constructor: seeds the intra-op thread override with the hardware-derived default.
+// Every other member is left to its own default initialization.
+LearningModelSessionOptions::LearningModelSessionOptions()
+  : intra_op_num_threads_override_(HardwareCoreEnumerator::DefaultIntraOpNumThreads()) {
+}
+
+
 LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options)
   : batch_size_override_(options.batch_size_override_),
-    close_model_on_session_creation_(options.close_model_on_session_creation_) {
+    close_model_on_session_creation_(options.close_model_on_session_creation_),
+    named_dim_overrides_(options.named_dim_overrides_),
+    intra_op_num_threads_override_(options.intra_op_num_threads_override_),
+    custom_ops_lib_paths_(options.custom_ops_lib_paths_) {
 }
 
 uint32_t LearningModelSessionOptions::BatchSizeOverride() {

diff --git a/winml/lib/Api/LearningModelSessionOptions.h b/winml/lib/Api/LearningModelSessionOptions.h
@@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
                                        LearningModelSessionOptions,
                                        ILearningModelSessionOptionsNative,
                                        ILearningModelSessionOptionsNative1> {
-  LearningModelSessionOptions() = default;
+  LearningModelSessionOptions();
 
   LearningModelSessionOptions(const LearningModelSessionOptions& options);
 
@@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
   // The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations.
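-  // The default value here is the maximum number of logical cores to ensure that the default behavior of WinML always runs the fastest.
+  // The default value is assigned in the constructor from HardwareCoreEnumerator::DefaultIntraOpNumThreads() so that the default behavior of WinML runs the fastest.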
   // WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool
-  uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency();
+  uint32_t intra_op_num_threads_override_;
 
   bool allow_thread_spinning_ = true;