From 93669ddb429c04411be265c2c255d643dfcc9728 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 12 Oct 2023 02:12:52 -0700
Subject: [PATCH 1/6] Fix index-out-of-bounds bug in Slice builder when
 initializer is shared.

---
 .../qnn/builder/opbuilder/slice_op_builder.cc | 198 ++++++++----------
 .../test/providers/qnn/slice_htp_test.cc      |  82 +++++++-
 2 files changed, 171 insertions(+), 109 deletions(-)
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index 15fc55b5b59b6..4f6cf714f4108 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -37,16 +37,16 @@ class SliceOpBuilder : public BaseOpBuilder {
                             TensorShapeVector& raw_starts,
                             TensorShapeVector& raw_ends,
                             TensorShapeVector& raw_axes) const;
-  typedef struct {
-    int32_t begin, end, stride;
-  } Range;
-  mutable std::vector<Range> ranges_;
 };
 
 Status SliceOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
   size_t input_count = node_unit.Inputs().size();
-  // Op set 9 only has 1 input with starts, ends, axes attribute
-  // Op set > 9, starts, ends, axes are from node input
+
+  // Opset < 10: Only has 1 data input. The starts, ends, and axes values are attributes.
+  // Opset >= 10: Everything is an input. The data, starts, and ends inputs are required.
+
+  ORT_RETURN_IF(node_unit.SinceVersion() >= 10 && input_count < 3);
+
   if (input_count > 1) {
     // Skip the first input. All other input need to be initializer
     for (size_t i = 1; i < input_count; i++) {
@@ -75,6 +75,45 @@ void SliceOpBuilder::GetDataFromAttribute(const NodeUnit& node_unit,
   }
 }
 
+// Gets the data from initializer inputs (e.g., starts, ends, axes, or steps) as a TensorShapeVector.
+static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModelWrapper& qnn_model_wrapper,
+                                      TensorShapeVector& output) {
+  OnnxInputInfo input_info = {};
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(input, input_info));
+  ORT_RETURN_IF_NOT(input_info.is_initializer,
+                    "QNN requires the starts, ends, axes, and steps inputs to "
+                    "be initializers");
+
+  std::vector<uint8_t> initializer_bytes;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, initializer_bytes));
+
+  size_t tensor_byte_size = initializer_bytes.size();
+  const auto data_type = input_info.initializer_tensor->data_type();
+
+  Status status;
+
+  switch (data_type) {
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
+      const int64_t* tensor_data = reinterpret_cast<const int64_t*>(initializer_bytes.data());
+      size_t size = tensor_byte_size / sizeof(int64_t);
+      output.insert(output.end(), tensor_data, tensor_data + size);
+      break;
+    }
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
+      const int32_t* tensor_data = reinterpret_cast<const int32_t*>(initializer_bytes.data());
+      size_t size = tensor_byte_size / sizeof(int32_t);
+      output.insert(output.end(), tensor_data, tensor_data + size);
+      break;
+    }
+    default:
+      status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", data_type,
+                               " is not supported for Slice initializer input ", input.node_arg.Name().c_str());
+      break;
+  }
+
+  return status;
+}
+
 // Note: For ONNX Slice operation the expected number of inputs is between 3 and 5
 Status SliceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
                                      const NodeUnit& node_unit,
@@ -84,123 +123,68 @@ Status SliceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   if (do_op_validation) {
     ORT_RETURN_IF_ERROR(ExplictOpCheck(qnn_model_wrapper, node_unit));
   }
-  Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32;
+
+  // Only need to add input 0. The other inputs (if any) contain static data that is passed to QNN APIs
+  // as static parameters.
+  return ProcessInput(qnn_model_wrapper, node_unit.Inputs()[0], logger, input_names);
+}
+
+Status SliceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
+                                                   const NodeUnit& node_unit,
+                                                   std::vector<std::string>&& input_names,
+                                                   const logging::Logger& logger,
+                                                   bool do_op_validation) const {
+  // Extract starts, ends, axes, and steps data from attributes (opset < 10) or initializer inputs (opset >= 10).
   TensorShapeVector raw_starts;
   TensorShapeVector raw_ends;
   TensorShapeVector raw_axes;
   TensorShapeVector raw_steps;
-  std::vector<uint32_t> input0_shape;
 
-  auto inputs = node_unit.Inputs();
-  auto input_count = inputs.size();
-  // Opset 9, only 1 input, starts, ends, axes are in attribute
-  if (1 == input_count) {
+  const auto& inputs = node_unit.Inputs();
+  const size_t input_count = inputs.size();
+
+  // Opset 9 only has 1 input. The starts, ends, axes values are attributes.
+  if (node_unit.SinceVersion() < 10) {
     GetDataFromAttribute(node_unit, raw_starts, raw_ends, raw_axes);
-  }
+  } else {
+    constexpr size_t starts_index = 1;
+    constexpr size_t ends_index = 2;
+    constexpr size_t axes_index = 3;
+    constexpr size_t steps_index = 4;
 
-  for (size_t input_i = 0; input_i < input_count; ++input_i) {
-    auto& input_name = inputs[input_i].node_arg.Name();
-    if (input_name.empty()) {
-      // Ignore unspecified/unused optional input
-      continue;
-    }
-    if (qnn_model_wrapper.IsQnnTensorWrapperExist(input_name)) {
-      LOGS(logger, VERBOSE) << "Tensor already added or the input is not named, skip it: " << input_name;
-      input_names.push_back(input_name);
-      ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[input_i].node_arg, input0_shape), "Cannot get shape");
-      continue;
+    // Starts input (required).
+    ORT_RETURN_IF_ERROR(GetInitializerInputData(inputs[starts_index], qnn_model_wrapper, raw_starts));
+
+    // Ends input (required).
+    ORT_RETURN_IF_ERROR(GetInitializerInputData(inputs[ends_index], qnn_model_wrapper, raw_ends));
+
+    // Axes input (optional).
+    if (input_count > axes_index && !inputs[axes_index].node_arg.Name().empty()) {
+      ORT_RETURN_IF_ERROR(GetInitializerInputData(inputs[axes_index], qnn_model_wrapper, raw_axes));
     }
 
-    bool is_quantized_tensor = inputs[input_i].quant_param.has_value();
-    const auto* type_proto = inputs[input_i].node_arg.TypeAsProto();
-    ORT_RETURN_IF_ERROR(utils::GetQnnDataType(is_quantized_tensor, type_proto, qnn_data_type));
-
-    std::vector<uint32_t> input_shape;
-    ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[input_i].node_arg, input_shape), "Cannot get shape");
-
-    Qnn_QuantizeParams_t quantize_param = QNN_QUANTIZE_PARAMS_INIT;
-    utils::InitializeQuantizeParam(quantize_param, is_quantized_tensor);
-    ORT_RETURN_IF_NOT(qnn_model_wrapper.ProcessQuantizationParameter(inputs[input_i].quant_param,
-                                                                     quantize_param.scaleOffsetEncoding.scale,
-                                                                     quantize_param.scaleOffsetEncoding.offset),
-                      "Cannot get quantization parameter");
-
-    std::vector<uint8_t> unpacked_tensor;
-    bool is_initializer_input = qnn_model_wrapper.IsInitializerInput(input_name);
-    if (is_initializer_input) {
-      const auto& input_tensor = qnn_model_wrapper.GetInitializerTensors().at(input_name);
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_tensor, unpacked_tensor));
-      size_t tensor_byte_size = unpacked_tensor.size();
-      const auto data_type = input_tensor->data_type();
-      TensorShapeVector data;
-      if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
-        const int64_t* tensor_data = reinterpret_cast<const int64_t*>(unpacked_tensor.data());
-        size_t size = tensor_byte_size / sizeof(int64_t);
-        data.insert(data.end(), tensor_data, tensor_data + size);
-      } else if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) {
-        const int32_t* tensor_data = reinterpret_cast<const int32_t*>(unpacked_tensor.data());
-        size_t size = tensor_byte_size / sizeof(int32_t);
-        data.insert(data.end(), tensor_data, tensor_data + size);
-      } else {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
-                               "Data type for starts and ends inputs' is not supported in this build. Got ",
-                               data_type);
-      }
-      if (input_i == 0) {
-        // Do nothing!
-      } else if (input_i == 1) {
-        // Starts
-        raw_starts = data;
-        continue;
-      } else if (input_i == 2) {
-        // Ends
-        raw_ends = data;
-        continue;
-      } else if (input_i == 3) {
-        // Axes
-        raw_axes = data;
-        continue;
-      } else if (input_i == 4) {
-        // Steps
-        raw_steps = data;
-        continue;
-      }
+    // Steps input (optional).
+    if (input_count > steps_index && !inputs[steps_index].node_arg.Name().empty()) {
+      ORT_RETURN_IF_ERROR(GetInitializerInputData(inputs[steps_index], qnn_model_wrapper, raw_steps));
     }
-    input0_shape = input_shape;
-
-    input_names.push_back(input_name);
-    Qnn_TensorType_t tensor_type = GetInputTensorType(qnn_model_wrapper, input_name);
-    Qnn_QuantizeParams_t quantize_params = QNN_QUANTIZE_PARAMS_INIT;
-    QnnTensorWrapper input_tensorwrapper(input_name, tensor_type, qnn_data_type, quantize_params,
-                                         std::move(input_shape), std::move(unpacked_tensor));
-    ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), "Failed to add tensor.");
   }
+
+  std::vector<uint32_t> input0_shape;
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input0_shape),
+                    "Cannot get shape for Slice input 0.");
+
   TensorShapeVector input_dimensions(input0_shape.cbegin(), input0_shape.cend());
   onnxruntime::SliceOp::PrepareForComputeMetadata compute_metadata(input_dimensions);
-  ORT_RETURN_IF_ERROR(
-      SliceOp::PrepareForComputeHelper(raw_starts, raw_ends, raw_axes, raw_steps, compute_metadata));
-  ranges_.clear();
-  for (size_t i = 0; i < input_dimensions.size(); i++) {
-    auto start = static_cast<int32_t>(compute_metadata.starts_[i]);
-    auto end = static_cast<int32_t>(compute_metadata.ends_[i]);
-    auto step = static_cast<int32_t>(compute_metadata.steps_[i]);
-    ranges_.push_back(Range({start, end, step}));
-  }
-  return Status::OK();
-}
+  ORT_RETURN_IF_ERROR(SliceOp::PrepareForComputeHelper(raw_starts, raw_ends, raw_axes, raw_steps, compute_metadata));
 
-Status SliceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
-                                                   const NodeUnit& node_unit,
-                                                   std::vector<std::string>&& input_names,
-                                                   const logging::Logger& logger,
-                                                   bool do_op_validation) const {
-  std::vector<uint32_t> ranges_dims{static_cast<uint32_t>(ranges_.size()), 3};
+  std::vector<uint32_t> ranges_dims{static_cast<uint32_t>(input_dimensions.size()), 3};
   std::vector<uint32_t> ranges_data;
-  for (auto range : ranges_) {
-    ranges_data.push_back(static_cast<uint32_t>(range.begin));
-    ranges_data.push_back(static_cast<uint32_t>(range.end));
-    ranges_data.push_back(static_cast<uint32_t>(range.stride));
+  for (size_t i = 0; i < input_dimensions.size(); i++) {
+    ranges_data.push_back(static_cast<uint32_t>(compute_metadata.starts_[i]));
+    ranges_data.push_back(static_cast<uint32_t>(compute_metadata.ends_[i]));
+    ranges_data.push_back(static_cast<uint32_t>(compute_metadata.steps_[i]));
   }
+
   QnnParamWrapper ranges_paramwrapper(node_unit.Index(),
                                       node_unit.Name(),
                                       QNN_OP_STRIDED_SLICE_PARAM_RANGES,
diff --git a/onnxruntime/test/providers/qnn/slice_htp_test.cc b/onnxruntime/test/providers/qnn/slice_htp_test.cc
index edc079dc65276..07c97d2d7b1fa 100644
--- a/onnxruntime/test/providers/qnn/slice_htp_test.cc
+++ b/onnxruntime/test/providers/qnn/slice_htp_test.cc
@@ -14,6 +14,48 @@
 
 namespace onnxruntime {
 namespace test {
+
+// Test for "index-out-of-bounds" bug that occurred when a Slice operator
+// shared one of its initializer inputs with another op that was processed by QNN EP first.
+TEST_F(QnnCPUBackendTests, Slice_SharedInitializersBugFix) {
+  // Model with an Add that processes a shared initializer before Slice is processed.
+  GetTestModelFn model_fn = [](ModelTestBuilder& builder) {
+    NodeArg* input0 = builder.MakeInput<int32_t>({2, 2}, {1, 2, 3, 4});
+
+    // Initializers
+    NodeArg* starts_input = builder.Make1DInitializer<int32_t>({1, 0});  // Shared by Add
+    NodeArg* ends_input = builder.Make1DInitializer<int32_t>({2, 2});
+    NodeArg* axes_input = builder.Make1DInitializer<int32_t>({0, 1});
+    NodeArg* steps_input = builder.Make1DInitializer<int32_t>({1, 1});
+
+    // Add input0 with a shared initializer.
+    NodeArg* add_output = builder.MakeIntermediate();
+    builder.AddNode("Add", {input0, starts_input}, {add_output});
+
+    // Cast Add's output to float.
+    NodeArg* cast_output = builder.MakeIntermediate();
+    Node& cast_node = builder.AddNode("Cast", {add_output}, {cast_output});
+    cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT));
+
+    // Slice Cast's output
+    NodeArg* slice0_out = builder.MakeOutput();
+    builder.AddNode("Slice", {cast_output, starts_input, ends_input, axes_input, steps_input}, {slice0_out});
+  };
+
+  ProviderOptions provider_options;
+
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnCpu.dll";
+#else
+  provider_options["backend_path"] = "libQnnCpu.so";
+#endif
+
+  RunQnnModelTest(model_fn,
+                  provider_options,
+                  13,  // opset
+                  ExpectedEPNodeAssignment::All);
+}
+
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
 /**
@@ -26,6 +68,7 @@ namespace test {
  * \param axes_def The axes input's definition.
  * \param steps_def The steps input's definition.
  * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None).
+ * \param use_contrib_qdq Force Q/DQ ops to use the com.microsoft domain (enable 16-bit).
  */
 template <typename QuantType = uint8_t>
 static void RunSliceQDQTest(const TestInputDef<float>& data_def,
@@ -33,7 +76,8 @@ static void RunSliceQDQTest(const TestInputDef<float>& data_def,
                             const TestInputDef<int64_t>& ends_def,
                             const TestInputDef<int64_t>& axes_def,
                             const TestInputDef<int64_t>& steps_def,
-                            ExpectedEPNodeAssignment expected_ep_assignment) {
+                            ExpectedEPNodeAssignment expected_ep_assignment,
+                            bool use_contrib_qdq = false) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
   provider_options["backend_path"] = "QnnHtp.dll";
@@ -45,7 +89,8 @@ static void RunSliceQDQTest(const TestInputDef<float>& data_def,
   const std::vector<TestInputDef<int64_t>> int64_inputs = {starts_def, ends_def, axes_def, steps_def};
 
   TestQDQModelAccuracy(BuildOpTestCase<float, int64_t>("Slice", f32_inputs, int64_inputs, {}),
-                       BuildQDQOpTestCase<QuantType, int64_t>("Slice", f32_inputs, int64_inputs, {}),
+                       BuildQDQOpTestCase<QuantType, int64_t>("Slice", f32_inputs, int64_inputs, {}, kOnnxDomain,
+                                                              use_contrib_qdq),
                        provider_options,
                        18,
                        expected_ep_assignment);
@@ -123,6 +168,39 @@ TEST_F(QnnHTPBackendTests, SliceInt32OnHTP) {
                                ExpectedEPNodeAssignment::All);
 }
 
+// Test 8-bit QDQ Slice with more than 1 axis.
+TEST_F(QnnHTPBackendTests, SliceU8_MultAxes) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunSliceQDQTest<uint8_t>(TestInputDef<float>({2, 4}, false, input_data),
+                           TestInputDef<int64_t>({2}, true, {1, 0}),  // starts
+                           TestInputDef<int64_t>({2}, true, {2, 3}),  // ends
+                           TestInputDef<int64_t>({2}, true, {0, 1}),  // axes
+                           TestInputDef<int64_t>({2}, true, {1, 2}),  // steps
+                           ExpectedEPNodeAssignment::All);
+}
+
+// Test 16-bit QDQ Slice with more than 1 axis.
+TEST_F(QnnHTPBackendTests, SliceU16_MultAxes) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunSliceQDQTest<uint16_t>(TestInputDef<float>({2, 4}, false, input_data),
+                            TestInputDef<int64_t>({2}, true, {1, 0}),  // starts
+                            TestInputDef<int64_t>({2}, true, {2, 3}),  // ends
+                            TestInputDef<int64_t>({2}, true, {0, 1}),  // axes
+                            TestInputDef<int64_t>({2}, true, {1, 2}),  // steps
+                            ExpectedEPNodeAssignment::All,
+                            true);  // Use com.microsoft Q/DQ ops for 16-bit
+}
+
+// Test 8-bit QDQ Slice with more than 1 axis and an end value that exceeds the associated dimension size.
+TEST_F(QnnHTPBackendTests, SliceU8_MultAxes_LargeEnd) {
+  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  RunSliceQDQTest<uint8_t>(TestInputDef<float>({2, 4}, false, input_data),
+                           TestInputDef<int64_t>({2}, true, {0, 1}),      // starts
+                           TestInputDef<int64_t>({2}, true, {-1, 1000}),  // ends
+                           TestInputDef<int64_t>({2}, true, {0, 1}),      // axes
+                           TestInputDef<int64_t>({2}, true, {1, 1}),      // steps
+                           ExpectedEPNodeAssignment::All);
+}
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
 }  // namespace test

From 429fea99a52ec7cfda98bbe9f3eacb5574ecc458 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 12 Oct 2023 02:30:25 -0700
Subject: [PATCH 2/6] No need for input count check

---
 .../core/providers/qnn/builder/opbuilder/slice_op_builder.cc   | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index 4f6cf714f4108..db0c6f9517f69 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -44,9 +44,6 @@ Status SliceOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const
 
   // Opset < 10: Only has 1 data input. The starts, ends, and axes values are attributes.
   // Opset >= 10: Everything is an input. The data, starts, and ends inputs are required.
-
-  ORT_RETURN_IF(node_unit.SinceVersion() >= 10 && input_count < 3);
-
   if (input_count > 1) {
     // Skip the first input. All other input need to be initializer
     for (size_t i = 1; i < input_count; i++) {

From 1567536eeb7670478678512bcd77be44159a4088 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 12 Oct 2023 12:45:47 -0700
Subject: [PATCH 3/6] Address review comments

---
 .../qnn/builder/opbuilder/slice_op_builder.cc | 28 +++++++++++--------
 .../core/providers/qnn/builder/qnn_utils.h    |  8 ++++++
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index db0c6f9517f69..90c82eeac7255 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -78,28 +78,29 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel
   OnnxInputInfo input_info = {};
   ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(input, input_info));
   ORT_RETURN_IF_NOT(input_info.is_initializer,
-                    "QNN requires the starts, ends, axes, and steps inputs to "
-                    "be initializers");
+                    "QNN requires the starts, ends, axes, and steps inputs to be initializers");
 
   std::vector<uint8_t> initializer_bytes;
+
+  // Note: UnpackInitializerData() uses ORT's protobuf utilities, which ensure that the initializer bytes are
+  // contiguous, aligned, and in the appropriate endianness. This is necessary to be able to reinterpret bytes
+  // as an array of larger elements.
   ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, initializer_bytes));
 
-  size_t tensor_byte_size = initializer_bytes.size();
   const auto data_type = input_info.initializer_tensor->data_type();
-
   Status status;
 
   switch (data_type) {
     case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
-      const int64_t* tensor_data = reinterpret_cast<const int64_t*>(initializer_bytes.data());
-      size_t size = tensor_byte_size / sizeof(int64_t);
-      output.insert(output.end(), tensor_data, tensor_data + size);
+      gsl::span<const int64_t> elements = qnn::utils::ReinterpretBytesAsSpan<int64_t>(initializer_bytes.data(),
+                                                                                      initializer_bytes.size());
+      output.insert(output.end(), elements.begin(), elements.end());
       break;
     }
     case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
-      const int32_t* tensor_data = reinterpret_cast<const int32_t*>(initializer_bytes.data());
-      size_t size = tensor_byte_size / sizeof(int32_t);
-      output.insert(output.end(), tensor_data, tensor_data + size);
+      gsl::span<const int32_t> elements = qnn::utils::ReinterpretBytesAsSpan<int32_t>(initializer_bytes.data(),
+                                                                                      initializer_bytes.size());
+      output.insert(output.end(), elements.begin(), elements.end());
       break;
     }
     default:
@@ -174,9 +175,12 @@ Status SliceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wr
   onnxruntime::SliceOp::PrepareForComputeMetadata compute_metadata(input_dimensions);
   ORT_RETURN_IF_ERROR(SliceOp::PrepareForComputeHelper(raw_starts, raw_ends, raw_axes, raw_steps, compute_metadata));
 
-  std::vector<uint32_t> ranges_dims{static_cast<uint32_t>(input_dimensions.size()), 3};
+  const size_t input_rank = input_dimensions.size();
+  std::vector<uint32_t> ranges_dims{static_cast<uint32_t>(input_rank), 3};
   std::vector<uint32_t> ranges_data;
-  for (size_t i = 0; i < input_dimensions.size(); i++) {
+  ranges_data.reserve(input_rank);
+
+  for (size_t i = 0; i < input_rank; i++) {
     ranges_data.push_back(static_cast<uint32_t>(compute_metadata.starts_[i]));
     ranges_data.push_back(static_cast<uint32_t>(compute_metadata.ends_[i]));
     ranges_data.push_back(static_cast<uint32_t>(compute_metadata.steps_[i]));
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index a54e0c8276e71..f71d63ba7e6f6 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -8,12 +8,20 @@
 #include <numeric>
 #include <vector>
 #include <string>
+#include <gsl/span>
 
 namespace onnxruntime {
 namespace qnn {
 class QnnOpConfigWrapper;
 
 namespace utils {
+
+// Reinterprets an array of contiguous bytes in the target's endianness to a span of elements.
+template <typename T>
+inline gsl::span<const T> ReinterpretBytesAsSpan(const uint8_t* data, size_t num_bytes) {
+  return gsl::span<const T>(reinterpret_cast<const T*>(data), num_bytes / sizeof(T));
+}
+
 size_t GetElementSizeByType(const Qnn_DataType_t& data_type);
 
 size_t GetElementSizeByType(ONNXTensorElementDataType elem_type);

From 98741862401434977ebf4959216153aec4fc2cb6 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 12 Oct 2023 12:55:27 -0700
Subject: [PATCH 4/6] Simplify more

---
 .../providers/qnn/builder/opbuilder/slice_op_builder.cc     | 6 ++----
 onnxruntime/core/providers/qnn/builder/qnn_utils.h          | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index 90c82eeac7255..ddfb3355c50b0 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -92,14 +92,12 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel
 
   switch (data_type) {
     case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
-      gsl::span<const int64_t> elements = qnn::utils::ReinterpretBytesAsSpan<int64_t>(initializer_bytes.data(),
-                                                                                      initializer_bytes.size());
+      gsl::span<const int64_t> elements = qnn::utils::ReinterpretBytesAsSpan<int64_t>(initializer_bytes);
       output.insert(output.end(), elements.begin(), elements.end());
       break;
     }
     case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
-      gsl::span<const int32_t> elements = qnn::utils::ReinterpretBytesAsSpan<int32_t>(initializer_bytes.data(),
-                                                                                      initializer_bytes.size());
+      gsl::span<const int32_t> elements = qnn::utils::ReinterpretBytesAsSpan<int32_t>(initializer_bytes);
       output.insert(output.end(), elements.begin(), elements.end());
       break;
     }
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index f71d63ba7e6f6..e2a0965fb8135 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -16,10 +16,10 @@ class QnnOpConfigWrapper;
 
 namespace utils {
 
-// Reinterprets an array of contiguous bytes in the target's endianness to a span of elements.
+// Reinterprets an array of contiguous, aligned bytes in the target's endianness to a span of elements.
 template <typename T>
-inline gsl::span<const T> ReinterpretBytesAsSpan(const uint8_t* data, size_t num_bytes) {
-  return gsl::span<const T>(reinterpret_cast<const T*>(data), num_bytes / sizeof(T));
+inline gsl::span<const T> ReinterpretBytesAsSpan(gsl::span<const uint8_t> bytes_span) {
+  return gsl::span<const T>(reinterpret_cast<const T*>(bytes_span.data()), bytes_span.size() / sizeof(T));
 }
 
 size_t GetElementSizeByType(const Qnn_DataType_t& data_type);

From e5683889d426ff275631aea2a469afbfc1781a1f Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 12 Oct 2023 22:59:53 -0700
Subject: [PATCH 5/6] Use Tensor utils to get initializer data

---
 .../qnn/builder/opbuilder/slice_op_builder.cc | 58 ++++++++++---------
 .../providers/qnn/builder/qnn_model_wrapper.h |  2 +
 .../core/providers/qnn/builder/qnn_utils.h    |  8 ---
 3 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index ddfb3355c50b0..83d0a65e6729d 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -8,6 +8,8 @@
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/cpu/tensor/slice_helper.h"
 
+#include "core/framework/tensorprotoutils.h"
+
 #include "base_op_builder.h"
 
 namespace onnxruntime {
@@ -75,36 +77,38 @@ void SliceOpBuilder::GetDataFromAttribute(const NodeUnit& node_unit,
 // Gets the data from initializer inputs (e.g., starts, ends, axes, or steps) as a TensorShapeVector.
 static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModelWrapper& qnn_model_wrapper,
                                       TensorShapeVector& output) {
-  OnnxInputInfo input_info = {};
-  ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetOnnxInputInfo(input, input_info));
-  ORT_RETURN_IF_NOT(input_info.is_initializer,
-                    "QNN requires the starts, ends, axes, and steps inputs to be initializers");
-
-  std::vector<uint8_t> initializer_bytes;
-
-  // Note: UnpackInitializerData() uses ORT's protobuf utilities, which ensure that the initializer bytes are
-  // contiguous, aligned, and in the appropriate endianness. This is necessary to be able to reinterpret bytes
-  // as an array of larger elements.
-  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, initializer_bytes));
+  const auto& input_name = input.node_arg.Name();
+  const bool is_initializer = qnn_model_wrapper.IsInitializerInput(input_name);
+  ORT_RETURN_IF_NOT(is_initializer, "Expected input ", input_name.c_str(), " to be an initializer.");
+  gsl::not_null<const ONNX_NAMESPACE::TensorProto*> initializer_proto = qnn_model_wrapper
+                                                                            .GetInitializerTensors()
+                                                                            .at(input_name);
+  ORT_RETURN_IF_NOT(initializer_proto->has_data_type(), "Expected initializer ", input_name.c_str(),
+                    " to have a proto data type.");
+
+  // Create empty Tensor.
+  const auto* dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer_proto->data_type())->GetElementType();
+  TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*initializer_proto);
+  Tensor tensor(dtype, shape, std::make_shared<CPUAllocator>());
+
+  // Deserialize initializer into Tensor.
+  const auto& model_path = qnn_model_wrapper.GetGraphViewer().ModelPath();
+  const ORTCHAR_T* model_path_str = model_path.IsEmpty() ? nullptr : model_path.ToPathString().c_str();
+  ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor(onnxruntime::Env::Default(), model_path_str,
+                                                              *initializer_proto, tensor));
 
-  const auto data_type = input_info.initializer_tensor->data_type();
   Status status;
 
-  switch (data_type) {
-    case ONNX_NAMESPACE::TensorProto_DataType_INT64: {
-      gsl::span<const int64_t> elements = qnn::utils::ReinterpretBytesAsSpan<int64_t>(initializer_bytes);
-      output.insert(output.end(), elements.begin(), elements.end());
-      break;
-    }
-    case ONNX_NAMESPACE::TensorProto_DataType_INT32: {
-      gsl::span<const int32_t> elements = qnn::utils::ReinterpretBytesAsSpan<int32_t>(initializer_bytes);
-      output.insert(output.end(), elements.begin(), elements.end());
-      break;
-    }
-    default:
-      status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", data_type,
-                               " is not supported for Slice initializer input ", input.node_arg.Name().c_str());
-      break;
+  // Copy Tensor of int32_t or int64_t elems into output (int64_ts).
+  if (tensor.IsDataType<int64_t>()) {
+    gsl::span<const int64_t> tensor_elems = tensor.DataAsSpan<int64_t>();
+    output.insert(output.end(), tensor_elems.begin(), tensor_elems.end());
+  } else if (tensor.IsDataType<int32_t>()) {
+    gsl::span<const int32_t> tensor_elems = tensor.DataAsSpan<int32_t>();
+    output.insert(output.end(), tensor_elems.begin(), tensor_elems.end());
+  } else {
+    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", DataTypeImpl::ToString(dtype),
+                             " is not supported for Slice initializer input ", input.node_arg.Name().c_str());
   }
 
   return status;
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
index 22f8d3a0eaa64..89f26e1f84d06 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@@ -181,6 +181,8 @@ class QnnModelWrapper {
 
   QnnBackendType GetQnnBackendType() { return qnn_backend_type_; }
 
+  const GraphViewer& GetGraphViewer() const { return graph_viewer_; }
+
  private:
   bool CreateQnnInputOutputTensors(const std::string& qnn_node_name,
                                    const std::vector<std::string>& names,
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index e2a0965fb8135..a54e0c8276e71 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -8,20 +8,12 @@
 #include <numeric>
 #include <vector>
 #include <string>
-#include <gsl/span>
 
 namespace onnxruntime {
 namespace qnn {
 class QnnOpConfigWrapper;
 
 namespace utils {
-
-// Reinterprets an array of contiguous, aligned bytes in the target's endianness to a span of elements.
-template <typename T>
-inline gsl::span<const T> ReinterpretBytesAsSpan(gsl::span<const uint8_t> bytes_span) {
-  return gsl::span<const T>(reinterpret_cast<const T*>(bytes_span.data()), bytes_span.size() / sizeof(T));
-}
-
 size_t GetElementSizeByType(const Qnn_DataType_t& data_type);
 
 size_t GetElementSizeByType(ONNXTensorElementDataType elem_type);

From 14bbb7d88d07727839b3e860c22a1da646400bd0 Mon Sep 17 00:00:00 2001
From: adrianlizarraga <adlizarraga@microsoft.com>
Date: Thu, 12 Oct 2023 23:17:49 -0700
Subject: [PATCH 6/6] Adjust model path handling

---
 .../core/providers/qnn/builder/opbuilder/slice_op_builder.cc  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index 83d0a65e6729d..88c94581a8887 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -92,8 +92,8 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel
   Tensor tensor(dtype, shape, std::make_shared<CPUAllocator>());
 
   // Deserialize initializer into Tensor.
-  const auto& model_path = qnn_model_wrapper.GetGraphViewer().ModelPath();
-  const ORTCHAR_T* model_path_str = model_path.IsEmpty() ? nullptr : model_path.ToPathString().c_str();
+  onnxruntime::PathString model_path = qnn_model_wrapper.GetGraphViewer().ModelPath().ToPathString();
+  const ORTCHAR_T* model_path_str = model_path.empty() ? nullptr : model_path.c_str();
   ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor(onnxruntime::Env::Default(), model_path_str,
                                                               *initializer_proto, tensor));