Cherry-pick for 1.17.1 patch release #19477

Merged Feb 21, 2024 (23 commits)

Commits (the diff below shows changes from 6 of the 23 commits)
506eddb
Whisper Crash Fix (#19345)
petermcaughan Jan 31, 2024
6e61306
Fix Split index bugs uncovered by QNN SDK 2.19 (#19381)
adrianlizarraga Feb 2, 2024
ad63507
[DML EP] Fix external data unpacking (#19415)
PatriceVignola Feb 7, 2024
a77ee4a
Add contrib Q/DQ ops to symbolic shape inference tool (#19340)
adrianlizarraga Jan 31, 2024
5269e93
[Quant tool] Ensure MSFT opset for Q/DQ models (#19335)
adrianlizarraga Jan 31, 2024
c1ce74d
Windows - Only set thread affinity on Server with auto affinity (#19318)
ivberg Jan 30, 2024
098ef2c
[js/web] fix types exports in package.json (#19458)
fs-eire Feb 8, 2024
f5f5cc8
Add capturestate / rundown ETW support logging for session and provid…
ivberg Feb 8, 2024
e02b783
Disable streams for the DML EP (#19481)
PatriceVignola Feb 10, 2024
14543de
Remove cuda gencode 90 to reduce onnxruntime-training package size (#…
baijumeswani Feb 12, 2024
605adb0
Ovep 1.17.1 (#19482)
preetha-intel Feb 12, 2024
27c0a2f
[QNN EP] Build x64 python wheel for QNN EP (#19499)
adrianlizarraga Feb 13, 2024
61730bd
Fix subgraph quantization regression in onnxruntime 1.17 (#19421)
fxmarty Feb 13, 2024
166488e
Restrict L2 Cache Core check to Intel devices (#19483)
smk2007 Feb 14, 2024
ad02db8
Update the default std flag used during torch extensions compilation …
baijumeswani Feb 14, 2024
4917fff
add ATen support for bicubic interpolation (#19380)
prathikr Feb 5, 2024
34c3623
Optimize KahnsTopologicalSort and PriorityNodeCompare (#19475)
smk2007 Feb 16, 2024
ad86d13
Support ONNX export of OpenAi Whisper model (#17316)
shubhambhokare1 Feb 9, 2024
485e17e
Whisper Timestamps and Temperature (#19509)
kunal-vaishnavi Feb 16, 2024
e79a06b
Enable DML on Windows and CUDA on Linux for Node.js binding (#19274)
jchen351 Feb 5, 2024
e96506e
add option DefaultTensorType to specify the default tensor type to qu…
xadupre Feb 20, 2024
1aa73b2
Disable __cpuid check on arm64 builds as intrinsic is not available (…
smk2007 Feb 20, 2024
d636587
Changed command line argpasrse to process '--symmetric [True|False]'.…
satyajandhyala Feb 21, 2024
@@ -258,7 +258,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
cpu_state.sequences.InitDevice(beam_state.sequences_device);
ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2),
cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2),
nullptr,
this->ort_stream_,
DeviceCopyDirection::hostToDevice));
}

@@ -214,7 +214,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
cpu_state.sequences.InitDevice(beam_state.sequences_device);
ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2),
cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2),
nullptr,
this->ort_stream_,
DeviceCopyDirection::hostToDevice));
}

@@ -226,7 +226,7 @@ Status BeamSearchWhisper<T>::Execute(const FeedsFetchesManager& encoder_feeds_fe
cpu_state.sequences.InitDevice(beam_state.sequences_device);
ORT_RETURN_IF_ERROR(this->device_copy_int32_func_(beam_state.sequences_device.subspan(0, beam_state.sequences_device.size() / 2),
cpu_state.sequences_space.subspan(0, cpu_state.sequences_space.size() / 2),
nullptr,
this->ort_stream_,
DeviceCopyDirection::hostToDevice));
}
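These three hunks make the same change in the GPT, T5 and Whisper beam-search paths: the host-to-device copy of the sequence buffer is now enqueued on the session's stream (this->ort_stream_) instead of a null stream, so the copy stays ordered with the kernels that consume it. A minimal CuPy sketch of that general pattern (CuPy is only an illustration here, not something ONNX Runtime uses):

```python
# Keep the H2D copy and the kernels that read it on the same stream, so the copy
# cannot race with the work that depends on the copied data.
import cupy as cp
import numpy as np

stream = cp.cuda.Stream(non_blocking=True)
host_sequences = np.arange(8, dtype=np.int32)

with stream:
    device_sequences = cp.asarray(host_sequences)  # copy enqueued on `stream`
    device_sequences += 1                          # kernel on the same stream, ordered after the copy

stream.synchronize()
print(cp.asnumpy(device_sequences))
```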

@@ -344,20 +344,25 @@ namespace Dml::GraphDescBuilder
dmlFusedNodeInputIndex < isConstGpuGraphInputCount &&
isConstGpuGraphInput[dmlFusedNodeInputIndex])
{
// This is a highly inefficient approach to generating constant nodes. It duplicates constant data
// across the graph input as well as every consumer's unique constant node. However it is currently
// This is a highly inefficient approach to generating constant nodes. It duplicates constant data
// across the graph input as well as every consumer's unique constant node. However it is currently
// only used for small inputs.
uint32_t c_maxConstNodeDataSize = 8;

ComPtr<OnnxTensorWrapper> constantInput = constantCpuGraphInputGetter(arg->Name());

auto& operatorGraphInputNode = graphNodeCreateInfo.nodesAsOperatorDesc[operatorGraphInputEdge.ToNodeIndex];
std::vector<DmlBufferTensorDesc*> toNodeInputTensorDescs = operatorGraphInputNode->GetInputTensors();
DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorGraphInputEdge.ToNodeInputIndex];
ComPtr<OnnxTensorWrapper> constantInput;

if (constantInput && tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize)
if (tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize)
{
// The tensor description's size should be no larger than the constant input unless it was rounded to
constantInput = constantCpuGraphInputGetter(arg->Name());
}

if (constantInput)
{
// The tensor description's size should be no larger than the constant input unless it was rounded to
// the required alignment.
assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes);
size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast<size_t>(tensorDesc->totalTensorSizeInBytes));
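The reordering above defers the constantCpuGraphInputGetter call until after the size check, so constant data is only materialized for inputs small enough to become constant nodes. A rough Python sketch of that control flow (hypothetical names, not the DML EP API):

```python
MAX_CONST_NODE_DATA_SIZE = 8  # bytes, mirrors c_maxConstNodeDataSize above

def maybe_fetch_constant(tensor_size_bytes, fetch_constant):
    # Only pay for fetching (and possibly unpacking external data) when the tensor
    # is small enough to be folded into a constant node at all.
    if tensor_size_bytes < MAX_CONST_NODE_DATA_SIZE:
        return fetch_constant()
    return None

print(maybe_fetch_constant(4, lambda: b"\x2a\x00\x00\x00"))   # fetched
print(maybe_fetch_constant(1024, lambda: b"\x00" * 1024))     # skipped -> None
```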
@@ -1123,7 +1123,7 @@ namespace Windows::AI::MachineLearning::Adapter
}
ORT_CATCH_RETURN
}

template <class NodeInfoImpl_t, class Base1_t, class Base2_t>
HRESULT STDMETHODCALLTYPE OpNodeInfoWrapper<NodeInfoImpl_t, Base1_t, Base2_t>::GetConstantInputTensor(uint32_t inputIndex, IMLOperatorTensor** tensor) const noexcept
{
@@ -1168,7 +1168,7 @@ namespace Windows::AI::MachineLearning::Adapter
m_requiredConstantCpuInputs.begin(),
m_requiredConstantCpuInputs.end(),
inputIndex) != m_requiredConstantCpuInputs.end();

// This shouldn't happen since kernel creation is deferred and repeated when required constant inputs are not present.
ORT_THROW_HR_IF(E_UNEXPECTED, inputRequiredAsConstant);
}
@@ -1562,7 +1562,13 @@ namespace Windows::AI::MachineLearning::Adapter
OnnxTensorWrapper::OnnxTensorWrapper(onnx::TensorProto* impl, const onnxruntime::Path& modelPath) : m_impl(impl)
{
// The tensor may be stored as raw data or in typed fields.
if (impl->has_raw_data())
if (impl->data_location() == onnx::TensorProto_DataLocation_EXTERNAL)
{
THROW_IF_NOT_OK(onnxruntime::utils::UnpackInitializerData(*impl, modelPath, m_unpackedExternalTensor));
m_dataPtr = reinterpret_cast<std::byte*>(m_unpackedExternalTensor.data());
m_tensorByteSize = m_unpackedExternalTensor.size();
}
else if (impl->has_raw_data())
{
m_dataPtr = reinterpret_cast<std::byte*>(impl->mutable_raw_data()->data());
m_tensorByteSize = impl->raw_data().size();
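The wrapper now recognizes initializers whose bytes live outside the model file (data_location == EXTERNAL) and unpacks them via UnpackInitializerData before falling back to raw_data. For reference, the same distinction in the onnx Python API, as a sketch that only covers the raw_data path and assumes the onnx package:

```python
import onnx
from onnx.external_data_helper import uses_external_data, load_external_data_for_tensor

def tensor_bytes(tensor: onnx.TensorProto, model_dir: str) -> bytes:
    # External initializers reference a side file; the model directory is needed to
    # resolve them, after which the bytes are available in raw_data.
    if uses_external_data(tensor):
        load_external_data_for_tensor(tensor, model_dir)
    return tensor.raw_data
```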
@@ -309,6 +309,7 @@ class OnnxTensorWrapper : public WRL::Base<IMLOperatorTensor>, public Closable
private:
size_t m_tensorByteSize = 0;
std::unique_ptr<std::byte[]> m_unpackedTensor;
std::vector<uint8_t> m_unpackedExternalTensor;
std::byte* m_dataPtr = nullptr;

// Lifetime is managed by the caller and guaranteed to outlive this class
@@ -55,6 +55,19 @@ Status SplitOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}

// Converts an ONNX list of split lengths to a QNN list of split indices.
// Note that the first split index at 0 is implicit (QNN SDK >= 2.19 will raise a validation error if included).
static void ConvertSplitLengthsToSplitIndices(gsl::span<const int64_t> split_lengths,
std::vector<uint32_t>& split_indices) {
uint32_t split_it = 0;
for (size_t i = 0; i < split_lengths.size(); ++i) {
if (i > 0) { // Do not include the 0th split index.
split_indices.push_back(split_it);
}
split_it += SafeInt<uint32_t>(split_lengths[i]);
}
}

Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
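The helper added above is easy to check by hand: QNN wants the start index of each chunk, with the leading 0 left implicit. A standalone Python restatement, using the two cases exercised by the updated tests later in this diff:

```python
def split_lengths_to_indices(split_lengths):
    indices, offset = [], 0
    for i, length in enumerate(split_lengths):
        if i > 0:              # the 0th index stays implicit (QNN SDK >= 2.19 rejects it)
            indices.append(offset)
        offset += length
    return indices

print(split_lengths_to_indices([2, 2, 2]))  # [2, 4]
print(split_lengths_to_indices([3, 3, 1]))  # [3, 6]
```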
@@ -79,22 +92,15 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
const int64_t* tensor_data = reinterpret_cast<const int64_t*>(unpacked_tensor.data());
size_t tensor_byte_size = unpacked_tensor.size();
size_t size = tensor_byte_size / sizeof(int64_t);
split_index.push_back(0); // QNN need the start index of each range and starts from 0
std::transform(tensor_data, tensor_data + size, std::back_inserter(split_index),
[](int64_t item) { return SafeInt<uint32_t>(item); });
split_index.pop_back();
ConvertSplitLengthsToSplitIndices({tensor_data, size}, split_index);
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic split");
}
} else {
NodeAttrHelper node_helper(node_unit);
if (node_helper.HasAttr("split")) {
auto split = node_helper.Get("split", std::vector<int32_t>{0});
uint32_t split_it = 0;
for (size_t i = 0; i < split.size(); ++i) {
split_index.push_back(split_it);
split_it += split[i];
}
auto split_lengths = node_helper.Get("split", std::vector<int64_t>{0});
ConvertSplitLengthsToSplitIndices(split_lengths, split_index);
}
}

@@ -105,11 +111,19 @@ Status SplitOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
"Cannot get shape");
ORT_ENFORCE(static_cast<int32_t>(input_shape.size()) > axis_value, "axis not valid!");
ORT_RETURN_IF_NOT(input_shape.at(axis_value) > 0, "Shape value not valid!");
auto num_outputs = node_unit.Outputs().size();
auto step = SafeInt<uint32_t>(input_shape.at(axis_value) / num_outputs);

// ONNX spec states that if not evenly divisible by `num_outputs`, the last chunk is smaller.
// Therefore, we have to use ceil() when computing shape[axis] / num_outputs.
// See: core/providers/cpu/tensor/split.cc::PrepareForCompute()
const float num_outputs = static_cast<float>(node_unit.Outputs().size());
const float split_dim_size = static_cast<float>(input_shape[axis_value]);
const uint32_t step = SafeInt<uint32_t>(std::ceil(split_dim_size / num_outputs));
uint32_t split_it = 0;

for (size_t i = 0; i < num_outputs; ++i) {
split_index.push_back(split_it);
if (i > 0) { // 0th split index is implicit (QNN >= 2.19 raises validation error if included)
split_index.push_back(split_it);
}
split_it += step;
}
}
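To see why ceil() is the right rounding, take the uneven case from the new test further down: shape[axis] = 7 split into num_outputs = 3 must produce chunks of 3, 3 and 1 (the ONNX spec gives the remainder to the last chunk), so the step is ceil(7/3) = 3 and the split indices are [3, 6]. A quick check in plain Python:

```python
import math

def split_indices_from_num_outputs(dim_size, num_outputs):
    step = math.ceil(dim_size / num_outputs)
    return [i * step for i in range(1, num_outputs)]  # leading 0 stays implicit

print(split_indices_from_num_outputs(7, 3))  # [3, 6] -> chunks of 3, 3, 1
print(split_indices_from_num_outputs(6, 3))  # [2, 4] -> even split unchanged
```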
10 changes: 10 additions & 0 deletions onnxruntime/core/util/thread_utils.cc
@@ -7,6 +7,7 @@

#ifdef _WIN32
#include <Windows.h>
#include <versionhelpers.h>
#endif
#include <thread>
#include "core/session/ort_apis.h"
@@ -98,7 +99,16 @@ CreateThreadPoolHelper(Env* env, OrtThreadPoolParams options) {
}
options.thread_pool_size = static_cast<int>(default_affinities.size());
if (options.auto_set_affinity) {
#ifdef _WIN32
// Only set thread affinity on Server with auto affinity.
// On client best to let OS scheduler handle.
// On big (P-Core) / little (E-Core) CPU designs affinity overrides QoS and has high power usage
if (IsWindowsServer()) {
to.affinities = std::move(default_affinities);
}
#else
to.affinities = std::move(default_affinities);
#endif
}
}
if (options.thread_pool_size <= 1) {
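A rough Python analogue of the new gate, for reproducing the Server-vs-client distinction outside the C++ code (assumption: platform.win32_edition() is only a stand-in for the Win32 IsWindowsServer() call used above):

```python
import platform

def should_auto_set_affinity() -> bool:
    # Non-Windows keeps the previous behaviour; on Windows, pin threads only on Server
    # SKUs and let the OS scheduler decide on client machines (notably P-core/E-core hybrids).
    if platform.system() != "Windows":
        return True
    return "Server" in (platform.win32_edition() or "")

print(should_auto_set_affinity())
```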
2 changes: 2 additions & 0 deletions onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -270,6 +270,8 @@ def quantize_model(self):

self.model.model.producer_name = __producer__
self.model.model.producer_version = __version__
if self.qdq_op_domain == ms_domain:
self.model.set_opset_import(ms_domain, 1)

return self.model.model
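The two added lines ensure that a model containing com.microsoft Q/DQ ops also declares that domain in its opset imports; without the import, downstream tooling and ONNX Runtime may fail to resolve those ops. A standalone sketch of the same idea with the public onnx helpers (hypothetical function, not part of the quantizer):

```python
import onnx

MS_DOMAIN = "com.microsoft"

def ensure_msft_opset(model: onnx.ModelProto, version: int = 1) -> None:
    # Append an opset import for com.microsoft if the model does not declare one yet.
    if not any(op.domain == MS_DOMAIN for op in model.opset_import):
        model.opset_import.append(onnx.helper.make_opsetid(MS_DOMAIN, version))
```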

27 changes: 27 additions & 0 deletions onnxruntime/python/tools/symbolic_shape_infer.py
@@ -197,6 +197,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
"BiasGelu": self._infer_BiasGelu,
"BiasSplitGelu": self._infer_BiasSplitGelu,
"DecoderMaskedMultiHeadAttention": self._infer_DecoderMaskedMultiHeadAttention,
"DequantizeLinear": self._infer_DequantizeLinear,
"EmbedLayerNormalization": self._infer_EmbedLayerNormalization,
"FastGelu": self._infer_FastGelu,
"GatedRelativePositionBias": self._infer_GatedRelativePositionBias,
@@ -212,6 +213,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
"PackedAttention": self._infer_PackedAttention,
"PackedMultiHeadAttention": self._infer_PackedMultiHeadAttention,
"PythonOp": self._infer_PythonOp,
"QuantizeLinear": self._infer_QuantizeLinear,
"QuickGelu": self._infer_FastGelu,
"RelativePositionBias": self._infer_RelativePositionBias,
"RemovePadding": self._infer_RemovePadding,
@@ -457,6 +459,8 @@ def _onnx_infer_single_node(self, node):
"GemmFastGelu",
"LayerNormalization",
"LongformerAttention",
"DequantizeLinear",
"QuantizeLinear",
"RelativePositionBias",
"RemovePadding",
"RestorePadding",
@@ -979,6 +983,29 @@ def _infer_NhwcConv(self, node): # noqa: N802
)
)

def _infer_DequantizeLinear(self, node): # noqa: N802
# Get the output data type from the scale input (index 1, required).
output_dtype = self.known_vi_[node.input[1]].type.tensor_type.elem_type

# Get the output shape from the first input.
output_shape = self._get_shape(node, 0)

vi = self.known_vi_[node.output[0]]
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))

def _infer_QuantizeLinear(self, node): # noqa: N802
# Get the output data type from the zero-point input (index 2, optional).
# Otherwise, default to uint8
output_dtype = onnx.TensorProto.UINT8
if len(node.input) > 2 and node.input[2]:
output_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type

# Get the output shape from the first input.
output_shape = self._get_shape(node, 0)

vi = self.known_vi_[node.output[0]]
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))

def _infer_Einsum(self, node): # noqa: N802
# ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275
equation = get_attribute(node, "equation")
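In short, both new handlers take the output shape from input 0 and only differ in where the element type comes from: DequantizeLinear uses the scale input's type, while QuantizeLinear uses the zero-point's type when one is given and defaults to uint8 otherwise. The same rule restated as standalone Python:

```python
import onnx

def qdq_output_elem_type(op_type, input_elem_types):
    if op_type == "DequantizeLinear":
        return input_elem_types[1]       # scale dtype (e.g. FLOAT or FLOAT16)
    if op_type == "QuantizeLinear":
        if len(input_elem_types) > 2:
            return input_elem_types[2]   # zero-point dtype
        return onnx.TensorProto.UINT8    # default when no zero-point is provided
    raise ValueError(f"unexpected op type: {op_type}")

print(qdq_output_elem_type("QuantizeLinear", [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT]))
# 2 (TensorProto.UINT8)
```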
41 changes: 34 additions & 7 deletions onnxruntime/test/providers/qnn/split_op_test.cc
@@ -302,19 +302,46 @@ TEST_F(QnnHTPBackendTests, Split_Int32_Opset13) {
// Test 8-bit QDQ Split opset 18 on HTP backend: equal split of axis 0 via 'num_outputs' attribute
// and 'split' input.
TEST_F(QnnHTPBackendTests, Split_Equal_Axis0_Opset18) {
// Split 6 into 3 outputs of lengths [2, 2, 2]
TestInputDef<float> input_def({6, 2}, false,
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f, 9.0f, 10.0f, 11.0f});

// Use 'split' input (initializer).
RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
{2, 2}, // split
0, // axis
-1, // num_outputs
18, // opset
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{2, 2, 2}, // split
0, // axis
-1, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);

// Use 'num_outputs' attribute.
RunQDQSplitOpTestOnHTP<uint8_t>(TestInputDef<float>({4, 2}, false, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f}),
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{}, // split (use num_outputs instead)
0, // axis
3, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);
}

// Test 8-bit QDQ Split opset 18 on HTP backend. Use an uneven split (last chunk should be smaller).
TEST_F(QnnHTPBackendTests, Split_NonEqual_Axis0_Opset18) {
// Split 7 into 3 outputs of lengths [3, 3, 1]
TestInputDef<float> input_def({7, 2}, false,
{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.f, 8.f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f});

// Use a `split` input with uneven split lengths.
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{3, 3, 1}, // split
0, // axis
-1, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);

// Use a `num_outputs` attribute that does not evenly divide into shape[axis].
RunQDQSplitOpTestOnHTP<uint8_t>(input_def,
{}, // split (use num_outputs instead)
0, // axis
2, // num_outputs
3, // num_outputs
18, // opset
ExpectedEPNodeAssignment::All);
}
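The expected shapes in the uneven case can be verified with plain NumPy slicing; note that ONNX's "last chunk is smaller" convention differs from numpy.array_split, which would spread the remainder as 3, 2, 2:

```python
import math
import numpy as np

x = np.arange(14, dtype=np.float32).reshape(7, 2)   # same 7x2 input as the test above
num_outputs, axis = 3, 0

step = math.ceil(x.shape[axis] / num_outputs)
chunks = [x[i * step:(i + 1) * step] for i in range(num_outputs)]
print([c.shape for c in chunks])  # [(3, 2), (3, 2), (1, 2)]
```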