From 5603450f1e65750b7d61a5c741f80f9308bb4cde Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 31 Aug 2023 17:52:10 -0700 Subject: [PATCH 1/9] Enable QLinearAveragePooling DML EP --- .../src/External/DirectMLHelpers/ApiTraits.h | 37 +++- .../External/DirectMLHelpers/DirectMLSchema.h | 25 +++ .../DirectMLHelpers/GeneratedSchemaHelpers.h | 22 ++ .../src/Operators/DmlOperator.cpp | 34 +++ .../src/Operators/DmlOperator.h | 7 + .../DmlOperatorQLinearAveragePooling.cpp | 208 ++++++++++++++++++ .../src/Operators/OperatorRegistration.cpp | 6 + .../dml/OperatorAuthorHelper/Attributes.h | 1 + .../OperatorAuthorHelper/OperatorHelper.cpp | 18 ++ .../dml/OperatorAuthorHelper/OperatorHelper.h | 12 + .../OperatorAuthorHelper/OperatorVersions.h | 1 + 11 files changed, 369 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index f6d71ce629a8d..570a0f82b62ff 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -14,7 +14,26 @@ struct DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC _Maybenull_ const DML_TENSOR_DESC* BiasTensor; const DML_TENSOR_DESC* OutputTensor; }; -const int DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT = 0x80000011; +const int DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT = 0x80000011; + +struct DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC +{ + const DML_TENSOR_DESC* InputTensor; + const DML_TENSOR_DESC* InputScaleTensor; + _Maybenull_ const DML_TENSOR_DESC* InputZeroPointTensor; + const DML_TENSOR_DESC* OutputScaleTensor; + _Maybenull_ const DML_TENSOR_DESC* OutputZeroPointTensor; + const DML_TENSOR_DESC* OutputTensor; + UINT DimensionCount; + _Field_size_(DimensionCount) const UINT* Strides; + _Field_size_(DimensionCount) const UINT* WindowSize; + _Field_size_(DimensionCount) const UINT* StartPadding; + _Field_size_(DimensionCount) const UINT* EndPadding; + _Field_size_(DimensionCount) const UINT* Dilations; + BOOL IncludePadding; +}; +const int DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING = 0x8000000B; + namespace ApiTraits { @@ -38,7 +57,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 161; + static constexpr auto ValueCount = 162; static constexpr size_t ActivationFunctionCount = 24; }; @@ -497,6 +516,12 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ROI_POOLING; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; +}; + template <> struct OperatorDescTraits { @@ -1492,6 +1517,12 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ROI_POOLING> using DescType = DML_ROI_POOLING_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING> +{ + using DescType = DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_SLICE> { @@ -2524,6 +2555,8 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args #pragma warning(disable: 4063) case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); #pragma warning(pop) default: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index f3a3aec50e4b4..2e9217cf3f4f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -829,6 +829,31 @@ constexpr DML_OPERATOR_SCHEMA DML_ROI_POOLING_OPERATOR_SCHEMA { DML_ROI_POOLING_OPERATOR_SCHEMA_FIELDS, }; + +constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS[13] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSize", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "IncludePadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA { + "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", + static_cast(DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING), + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 13, + DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_SLICE_OPERATOR_SCHEMA_FIELDS[6] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 90915c7e757de..1b82295ea4f9e 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -473,6 +473,24 @@ inline std::vector GetFields(const DML_ROI_POOLING_OPERATOR_DESC& OperatorField(&DML_ROI_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.PooledSize))), }; } +inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.InputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.OutputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.OutputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.WindowSize), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[9], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[10], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[11], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), + }; +} inline std::vector GetFields(const DML_SLICE_OPERATOR_DESC& desc) { return { @@ -2492,6 +2510,10 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return AbstractOperatorDesc( + &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); #pragma warning(pop) default: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 25c7be42d6425..78f7d1aef33bd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -635,6 +635,40 @@ namespace Dml )); } + void DmlOperator::ConvertNHWCToNCHW( + const uint32_t& dimCount, + const gsl::span nhwcSizes, + std::vector& nchwSizes, + std::vector& nchwInputStrides) + { + int i = 0; + const 
uint32_t inputBatch = nhwcSizes[ i++ ]; + const uint32_t inputDepth = dimCount == 5 ? nhwcSizes[ i++ ] : 0; + const uint32_t inputHeight = nhwcSizes[ i++ ]; + const uint32_t inputWidth = nhwcSizes[ i++ ]; + const uint32_t inputChannels = nhwcSizes[ i++ ]; + + if (dimCount == 4) + { + nchwSizes = { inputBatch, inputChannels, inputHeight, inputWidth }; + nchwInputStrides = { inputHeight * inputWidth * inputChannels, + 1, + inputWidth * inputChannels, + inputChannels + }; + } + else + { + nchwSizes = { inputBatch, inputChannels, inputDepth, inputHeight, inputWidth }; + nchwInputStrides = { inputDepth * inputChannels * inputWidth * inputHeight, + 1, + inputChannels * inputWidth * inputHeight, + inputChannels * inputWidth, + inputChannels + }; + } + } + TensorDesc DmlOperator::CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index c1e8cf42a974c..394195ce14a6d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -119,6 +119,13 @@ namespace Dml ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); + void DmlOperator::ConvertNHWCToNCHW( + const uint32_t& dimCount, + const gsl::span nhwcSizes, + std::vector& nchwSizes, + std::vector& nchwInputStrides + ); + TensorDesc CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp new file mode 100644 index 0000000000000..4303149eae347 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -0,0 +1,208 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
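For orientation, the operator implemented in this new file follows the usual QLinear pattern: dequantize the 8-bit input with x_scale/x_zero_point, run an ordinary average pool, then requantize with y_scale/y_zero_point. A minimal scalar sketch of that math, assuming the uint8 case, per-tensor quantization parameters, and ignoring count_include_pad; the helper name and the std::vector window are illustrative, not part of the patch (DirectML is expected to fuse all of this into the single DML_QUANTIZED_LINEAR_AVERAGE_POOLING operator):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Reference-only helper: averages one pooling window in dequantized space,
    // then requantizes the mean with the output scale/zero point.
    inline uint8_t QLinearAverageReference(
        const std::vector<uint8_t>& window,
        float xScale, int32_t xZeroPoint,
        float yScale, int32_t yZeroPoint)
    {
        float sum = 0.0f;
        for (uint8_t q : window)
        {
            sum += (static_cast<int32_t>(q) - xZeroPoint) * xScale;  // dequantize
        }
        const float mean = sum / static_cast<float>(window.size());  // average pool
        const int32_t y = static_cast<int32_t>(std::nearbyint(mean / yScale)) + yZeroPoint;  // requantize
        return static_cast<uint8_t>(std::clamp(y, 0, 255));  // saturate to the uint8 range
    }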
+ +#include "precomp.h" + +namespace Dml +{ + +class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelperBase +{ + // For QLinear Avg Pool ORT and DML have same indexing order + enum OrtInputTensors : uint32_t + { + ortInput, + ortInputScale, + ortInputZeroPoint, + ortOutputScale, + ortOutputZeroPoint, + ortInputCount + }; + +public: + using Self = DmlOperatorQLinearAveragePooling; + + DmlOperatorQLinearAveragePooling( + const MLOperatorKernelCreationContext& kernelInfo, + bool useGlobalPooling + ) + : DmlOperator(kernelInfo), + PoolingHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), useGlobalPooling) + { + DmlOperator::Initialize(kernelInfo); + + bool isNhwc = m_kernel.channelsLast; + std::vector inputShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortInput); + std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); + + // Initialize the input descriptions with broadcasting + m_inputTensorDescs[OrtInputTensors::ortInput] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortInput, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape); + + uint32_t dmlDimSize = m_inputTensorDescs[OrtInputTensors::ortInput].GetDimensionCount(); + // Resize the Input Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortInputScale] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortInputScale, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + + // Resize the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + if (kernelInfo.IsInputValid(OrtInputTensors::ortInputZeroPoint)) + { + + m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortInputZeroPoint, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + } + + // Resize the Output Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortOutputScale] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortInputScale, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + + // Resize the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. 
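A worked example of the spatial-dimension shift performed a little further down in this constructor (the numbers are illustrative, not taken from the patch): a 1-D pool with kernel_shape {3}, strides {2}, and pads {1, 1} that arrives with a 4-D DML input description has expectedSpatialDimCount == 2, so each kernel array is shifted right by one slot and left-filled with identity values:

    windowSize   {3}  ->  {1, 3}
    strides      {2}  ->  {1, 2}
    startPadding {1}  ->  {0, 1}
    endPadding   {1}  ->  {0, 1}
    dilations    {1}  ->  {1, 1}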
+ if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) + { + + m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortOutputZeroPoint, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + } + + // Initialize the output description while overriding the shape + m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); + + assert(m_kernel.spatialDimensionCount <= ARRAYSIZE(m_kernel.windowSize)); + + // DML requires that DimensionCount be equal to Input.DimCount - 2 for Pooling + uint32_t expectedSpatialDimCount = m_inputTensorDescs[0].GetDimensionCount() - 2; + if (m_kernel.spatialDimensionCount < expectedSpatialDimCount) + { + size_t shift = expectedSpatialDimCount - m_kernel.spatialDimensionCount; + + for (int i = gsl::narrow_cast(m_kernel.spatialDimensionCount) - 1; i >= 0; i--) + { + m_kernel.windowSize[i + shift] = m_kernel.windowSize[i]; + m_kernel.windowSize[i] = 1; + + m_kernel.strides[i + shift] = m_kernel.strides[i]; + m_kernel.strides[i] = 1; + + m_kernel.startPadding[i + shift] = m_kernel.startPadding[i]; + m_kernel.startPadding[i] = 0; + + m_kernel.endPadding[i + shift] = m_kernel.endPadding[i]; + m_kernel.endPadding[i] = 0; + + m_kernel.dilations[i + shift] = m_kernel.dilations[i]; + m_kernel.dilations[i] = 1; + } + + m_kernel.spatialDimensionCount = expectedSpatialDimCount; + } + + if (isNhwc) + { + uint32_t dimCount = m_inputTensorDescs[0].GetDimensionCount(); + const auto inputSizes = m_inputTensorDescs[OrtInputTensors::ortInput].GetSizes(); + std::vector nchwInputSizes; + std::vector nchwInputStrides; + ConvertNHWCToNCHW(dimCount, inputSizes, nchwInputSizes, nchwInputStrides); + m_inputTensorDescs[OrtInputTensors::ortInput] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInput].GetDmlDataType(), nchwInputSizes, nchwInputStrides); + + gsl::span inputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortInputScale].GetSizes(); + std::vector nchwInputScaleSizes; + std::vector nchwInputScaleStrides; + ConvertNHWCToNCHW(dimCount, inputScaleSizes, nchwInputScaleSizes, nchwInputScaleStrides); + m_inputTensorDescs[OrtInputTensors::ortInputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputScale].GetDmlDataType(), nchwInputScaleSizes, nchwInputScaleStrides); + + gsl::span inputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetSizes(); + std::vector nchwInputZeroPointSizes; + std::vector nchwInputZeroPointStrides; + ConvertNHWCToNCHW(dimCount, inputZeroPointSizes, nchwInputZeroPointSizes, nchwInputZeroPointStrides); + m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetDmlDataType(), nchwInputZeroPointSizes, nchwInputZeroPointStrides); + + gsl::span outputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetSizes(); + std::vector nchwOutputScaleSizes; + std::vector nchwOutputScaleStrides; + ConvertNHWCToNCHW(dimCount, outputScaleSizes, nchwOutputScaleSizes, nchwOutputScaleStrides); + m_inputTensorDescs[OrtInputTensors::ortOutputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetDmlDataType(), nchwOutputScaleSizes, nchwOutputScaleStrides); + + gsl::span outputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetSizes(); + std::vector nchwOutputZeroPointSizes; + std::vector 
nchwOutputZeroPointStrides; + ConvertNHWCToNCHW(dimCount, outputZeroPointSizes, nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); + m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetDmlDataType(), nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); + + gsl::span outputSizes = m_outputTensorDescs[0].GetSizes(); + std::vector nchwOutputSizes; + std::vector nchwOutputStrides; + ConvertNHWCToNCHW(dimCount, outputSizes, nchwOutputSizes, nchwOutputStrides); + m_outputTensorDescs[0] = TensorDesc(m_outputTensorDescs[0].GetDmlDataType(), nchwOutputSizes, nchwOutputStrides); + } + + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); + + DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC qLinearAvgPooldesc = {}; + + qLinearAvgPooldesc.InputTensor = &inputDescs[OrtInputTensors::ortInput]; + qLinearAvgPooldesc.InputScaleTensor = &inputDescs[OrtInputTensors::ortInputScale]; + qLinearAvgPooldesc.InputZeroPointTensor = &inputDescs[OrtInputTensors::ortInputZeroPoint]; + qLinearAvgPooldesc.OutputScaleTensor = &inputDescs[OrtInputTensors::ortOutputScale];; + qLinearAvgPooldesc.OutputZeroPointTensor = &inputDescs[OrtInputTensors::ortOutputZeroPoint];; + qLinearAvgPooldesc.OutputTensor = &outputDescs[0]; + qLinearAvgPooldesc.DimensionCount = m_kernel.spatialDimensionCount; + qLinearAvgPooldesc.WindowSize = m_kernel.windowSize; + qLinearAvgPooldesc.Strides = m_kernel.strides; + qLinearAvgPooldesc.StartPadding = m_kernel.startPadding; + qLinearAvgPooldesc.EndPadding = m_kernel.endPadding; + qLinearAvgPooldesc.Dilations = m_kernel.dilations; + qLinearAvgPooldesc.IncludePadding = kernelInfo.GetOptionalAttribute(AttrName::CountIncludePad, false); + + DML_OPERATOR_DESC opDesc = { (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING, &qLinearAvgPooldesc }; + SetDmlOperatorDesc(opDesc, kernelInfo); + } +}; + +template +class DmlOperatorQuantizedPoolingTemplate : public DmlOperatorQLinearAveragePooling +{ +public: + DmlOperatorQuantizedPoolingTemplate(const MLOperatorKernelCreationContext& kernelInfo) + : DmlOperatorQLinearAveragePooling(kernelInfo, UseGlobalPooling) + { + } +}; + +DML_OP_DEFINE_CREATION_FUNCTION(QLinearAveragePool, DmlOperatorQuantizedPoolingTemplate); +DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQuantizedPoolingTemplate); +//DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQLinearAveragePooling, true); //useGobalPool + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 07ff4f3145459..c22254fee76a7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -257,6 +257,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(GlobalMaxPool); DML_OP_EXTERN_CREATION_FUNCTION(LpPool); DML_OP_EXTERN_CREATION_FUNCTION(GlobalLpPool); DML_OP_EXTERN_CREATION_FUNCTION(MaxRoiPool); +DML_OP_EXTERN_CREATION_FUNCTION(QLinearAveragePool); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign10); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign16); DML_OP_EXTERN_CREATION_FUNCTION(InstanceNormalization); @@ -587,6 +588,10 @@ constexpr static std::array supportedTypeListQLinea SupportedTensorDataTypes::Ints8Bit|SupportedTensorDataTypes::Float32, }; +constexpr static 
std::array supportedTypeListQLinearAveragePool = { + SupportedTensorDataTypes::Ints8Bit +}; + template constexpr auto requiredConstantCpuInputs(Args... args) { @@ -992,6 +997,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 9, MaxUnpool, typeNameListTwo, supportedTypeListMaxUnpool, DmlGraphSupport::Supported, requiredConstantCpuInputs(2))}, {REG_INFO( 11, MaxUnpool, typeNameListTwo, supportedTypeListMaxUnpool, DmlGraphSupport::Supported, requiredConstantCpuInputs(2))}, // 11 is identical to 9. + {REG_INFO_MS( 1, QLinearAveragePool, typeNameListDefault, supportedTypeListQLinearAveragePool, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, QLinearAdd, typeNameListDefault, supportedTypeListInteger8, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearConv, typeNameListFour, supportedTypeListQLinearConv, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearMatMul, typeNameListThree, supportedTypeListQLinearMatMul, DmlGraphSupport::Supported)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h index 5be84a931f4f1..543e30fcd9722 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h @@ -24,6 +24,7 @@ namespace AttrName static constexpr const char* Border = "border"; static constexpr const char* Broadcast = "broadcast"; static constexpr const char* CeilMode = "ceil_mode"; + static constexpr const char* ChannelsLast = "channels_last"; static constexpr const char* Clip = "clip"; static constexpr const char* CoordinateTransformationMode = "coordinate_transformation_mode"; static constexpr const char* CountIncludePad = "count_include_pad"; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index 4d59964dcc664..aa6b5baa1aa07 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -495,6 +495,7 @@ namespace OperatorHelper } args.useCeilingOutputShape = kernelInfo.GetOptionalAttribute(AttrName::CeilMode, 0); + args.channelsLast = kernelInfo.GetOptionalAttribute(AttrName::ChannelsLast, 0); return args; } @@ -2012,7 +2013,24 @@ namespace OperatorHelper } return outputShapes; } + + std::vector QLinearAveragePoolingHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const + { + auto inputShape = shapeInfo.GetInputTensorShape(0); + std::vector outputDimensions = InitializeKernelOutputDimensions(inputShape, m_kernel, m_kernel.channelsLast); + // MaxPool may have both an output and an indices tensor (both the same size). 
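For reference, the per-dimension output size that InitializeKernelOutputDimensions is expected to produce here follows the standard ONNX pooling arithmetic (dilations default to 1 for this contrib op). A minimal sketch with the rounding selected by ceil_mode; the function name is illustrative, not the actual helper:

    #include <cmath>
    #include <cstdint>

    inline uint32_t PooledDimension(
        uint32_t inputSize, uint32_t windowSize, uint32_t stride,
        uint32_t startPadding, uint32_t endPadding, bool useCeiling)
    {
        // (in + padBegin + padEnd - window) / stride, floored or ceiled, plus one.
        const double scaled =
            (static_cast<double>(inputSize) + startPadding + endPadding - windowSize) / stride;
        return static_cast<uint32_t>(useCeiling ? std::ceil(scaled) : std::floor(scaled)) + 1;
    }

    // e.g. PooledDimension(32, 32, 1, 0, 0, false) == 1, the global-pooling case.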
+ const uint32_t outputCount = shapeInfo.GetOutputCount(); + assert(outputCount == 1 || outputCount == 2); + + std::vector outputShapes; + for (uint32_t i = 0; i < outputCount; ++i) + { + outputShapes.push_back(outputDimensions); + } + return outputShapes; + } + std::vector RoiPoolingHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const { auto roiShape = shapeInfo.GetInputTensorShape(InputTensors::ROIS); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 5add951dccb78..47e7573951803 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -160,6 +160,7 @@ struct KernelArgs bool autoPad = false; bool autoPadSameUpper = false; bool useCeilingOutputShape = false; + bool channelsLast = false; uint32_t spatialDimensionCount = 0; KernelArgs(uint32_t spatialDimensionCount) : spatialDimensionCount(spatialDimensionCount) @@ -188,6 +189,7 @@ struct KernelArgs KernelArgs(KernelArgs const& kernelArgs, uint32_t minimumDimensionCount) : autoPad(kernelArgs.autoPad), autoPadSameUpper(kernelArgs.autoPadSameUpper), + channelsLast(kernelArgs.channelsLast), spatialDimensionCount(std::max(kernelArgs.spatialDimensionCount, minimumDimensionCount)) { ML_CHECK_VALID_ARGUMENT(spatialDimensionCount <= NcdhwSpatialDimensionCount); @@ -1168,6 +1170,15 @@ class RoiAlignHelper : public RoiPoolingHelperBase std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; }; +class QLinearAveragePoolingHelper : public PoolingHelperBase +{ +public: + template + QLinearAveragePoolingHelper(const Info_t& info, const Shape_t& shape/*, bool useGlobalPooling */) : PoolingHelperBase(info, shape, false) {} + std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; + +}; + class SqueezeHelper { public: @@ -1497,6 +1508,7 @@ using ShapeInferenceHelper_MaxUnpool = UnpoolingHelper; using ShapeInferenceHelper_LpPool = PoolingHelper; using ShapeInferenceHelper_GlobalLpPool = GlobalPoolingHelper; using ShapeInferenceHelper_MaxRoiPool = RoiPoolingHelper; +using ShapeInferenceHelper_QLinearAveragePool = QLinearAveragePoolingHelper; using ShapeInferenceHelper_RoiAlign10 = VersionedOpsetHelper; using ShapeInferenceHelper_RoiAlign16 = VersionedOpsetHelper; using ShapeInferenceHelper_InstanceNormalization = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index d785f77e24344..5293d630f40f2 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -441,6 +441,7 @@ namespace OperatorHelper static const int sc_sinceVer_GroupNorm = 1; static const int sc_sinceVer_DynamicQuantizeMatMul = 1; static const int sc_sinceVer_QLinearConcat = 1; + static const int sc_sinceVer_QLinearAveragePool = 1; } // namespace MsftOperatorSet1 } // namespace OperatorHelper From bf31e674c99ce7c544c9dd0fa561e894bb6f7157 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Mon, 18 Sep 2023 13:20:42 -0700 Subject: [PATCH 2/9] Implementation for QLinearGlobalAveragePool and addressing few review comments --- .../src/Operators/DmlOperator.cpp | 12 ++++---- .../src/Operators/DmlOperator.h | 4 +-- .../DmlOperatorQLinearAveragePooling.cpp | 6 +--- 
.../src/Operators/OperatorRegistration.cpp | 2 ++ .../OperatorAuthorHelper/OperatorHelper.cpp | 28 ++++++++++++++++--- .../dml/OperatorAuthorHelper/OperatorHelper.h | 18 ++++++++++-- .../OperatorAuthorHelper/OperatorVersions.h | 1 + .../qlinear_global_average_pool_test.cc | 3 ++ .../test/contrib_ops/qlinear_pool_test.cc | 10 +++++++ 9 files changed, 64 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 78f7d1aef33bd..5c2b75d197352 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -636,17 +636,17 @@ namespace Dml } void DmlOperator::ConvertNHWCToNCHW( - const uint32_t& dimCount, + const uint32_t dimCount, const gsl::span nhwcSizes, std::vector& nchwSizes, std::vector& nchwInputStrides) { int i = 0; - const uint32_t inputBatch = nhwcSizes[ i++ ]; - const uint32_t inputDepth = dimCount == 5 ? nhwcSizes[ i++ ] : 0; - const uint32_t inputHeight = nhwcSizes[ i++ ]; - const uint32_t inputWidth = nhwcSizes[ i++ ]; - const uint32_t inputChannels = nhwcSizes[ i++ ]; + const uint32_t inputBatch = nhwcSizes[i++]; + const uint32_t inputDepth = dimCount == 5 ? nhwcSizes[i++] : 0; + const uint32_t inputHeight = nhwcSizes[i++]; + const uint32_t inputWidth = nhwcSizes[i++]; + const uint32_t inputChannels = nhwcSizes[i++]; if (dimCount == 4) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index 394195ce14a6d..cecb943c382cf 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -119,8 +119,8 @@ namespace Dml ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); - void DmlOperator::ConvertNHWCToNCHW( - const uint32_t& dimCount, + void ConvertNHWCToNCHW( + const uint32_t dimCount, const gsl::span nhwcSizes, std::vector& nchwSizes, std::vector& nchwInputStrides diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp index 4303149eae347..2d4b28b69126e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -35,11 +35,8 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe std::vector inputShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortInput); std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); - // Initialize the input descriptions with broadcasting - m_inputTensorDescs[OrtInputTensors::ortInput] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortInput, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape); - uint32_t dmlDimSize = m_inputTensorDescs[OrtInputTensors::ortInput].GetDimensionCount(); - // Resize the Input Scale to be the same dimension as the input tensor. 
+ // Reshape the Input Scale to be the same dimension as the input tensor. // The 1D tensor needs to be moved to the H channel. m_inputTensorDescs[OrtInputTensors::ortInputScale] = CreateTensorDescFromInput( kernelInfo, @@ -203,6 +200,5 @@ class DmlOperatorQuantizedPoolingTemplate : public DmlOperatorQLinearAveragePool DML_OP_DEFINE_CREATION_FUNCTION(QLinearAveragePool, DmlOperatorQuantizedPoolingTemplate); DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQuantizedPoolingTemplate); -//DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQLinearAveragePooling, true); //useGobalPool } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index c22254fee76a7..daa8d70b6dac2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -258,6 +258,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(LpPool); DML_OP_EXTERN_CREATION_FUNCTION(GlobalLpPool); DML_OP_EXTERN_CREATION_FUNCTION(MaxRoiPool); DML_OP_EXTERN_CREATION_FUNCTION(QLinearAveragePool); +DML_OP_EXTERN_CREATION_FUNCTION(QLinearGlobalAveragePool); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign10); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign16); DML_OP_EXTERN_CREATION_FUNCTION(InstanceNormalization); @@ -998,6 +999,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 11, MaxUnpool, typeNameListTwo, supportedTypeListMaxUnpool, DmlGraphSupport::Supported, requiredConstantCpuInputs(2))}, // 11 is identical to 9. {REG_INFO_MS( 1, QLinearAveragePool, typeNameListDefault, supportedTypeListQLinearAveragePool, DmlGraphSupport::Supported)}, + {REG_INFO_MS( 1, QLinearGlobalAveragePool, typeNameListDefault, supportedTypeListQLinearAveragePool, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, QLinearAdd, typeNameListDefault, supportedTypeListInteger8, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearConv, typeNameListFour, supportedTypeListQLinearConv, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearMatMul, typeNameListThree, supportedTypeListQLinearMatMul, DmlGraphSupport::Supported)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index aa6b5baa1aa07..1fcd3b04300f4 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -365,13 +365,20 @@ namespace OperatorHelper } // Creates a kernel that spans the entire spatial dimensions of the input. - KernelArgs InitializeGlobalKernel(gsl::span inputDimensions) + KernelArgs InitializeGlobalKernel( + const MLOperatorAttributes& kernelInfo, + gsl::span inputDimensions) { ML_CHECK_VALID_ARGUMENT(inputDimensions.size() > NonspatialDimensionCount); // Must be at least 1D convolution (in 3D tensor) uint32_t spatialDimensionCount = gsl::narrow_cast(inputDimensions.size()) - NonspatialDimensionCount; ML_CHECK_VALID_ARGUMENT(spatialDimensionCount <= NcdhwSpatialDimensionCount); // Support up to 3D convolution (in 5D tensor). 
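A worked example of the window sizes the loop below derives for global pooling (shapes are illustrative, and NonspatialDimensionCount is assumed to count the batch and channel dimensions): a 4-D input has two spatial dimensions, and channels_last only changes which positions they are read from:

    NCHW input {1, 3, 32, 32}: dimOffset = 0  ->  windowSize = {32, 32}
    NHWC input {1, 32, 32, 3}: dimOffset = 1  ->  windowSize = {32, 32}  (skips the trailing C)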
KernelArgs args(spatialDimensionCount); + args.useCeilingOutputShape = kernelInfo.GetOptionalAttribute(AttrName::CeilMode, 0); + args.channelsLast = kernelInfo.GetOptionalAttribute(AttrName::ChannelsLast, 0); + // For Global Pooling, kernel size equal to the spatial dimension of input tensor + // NHWC layout need to offset by one dim to acount for channel placed at the end + int dimOffset = args.channelsLast ? 1 : 0; for (size_t dim = 0; dim < spatialDimensionCount; ++dim) { @@ -379,7 +386,7 @@ namespace OperatorHelper args.dilations[dim] = 1; args.startPadding[dim] = 0; args.endPadding[dim] = 0; - args.windowSize[dim] = gsl::narrow_cast(inputDimensions[inputDimensions.size() - spatialDimensionCount + dim]); + args.windowSize[dim] = gsl::narrow_cast(inputDimensions[inputDimensions.size() - spatialDimensionCount + dim - dimOffset]); } return args; @@ -2019,9 +2026,22 @@ namespace OperatorHelper auto inputShape = shapeInfo.GetInputTensorShape(0); std::vector outputDimensions = InitializeKernelOutputDimensions(inputShape, m_kernel, m_kernel.channelsLast); - // MaxPool may have both an output and an indices tensor (both the same size). const uint32_t outputCount = shapeInfo.GetOutputCount(); - assert(outputCount == 1 || outputCount == 2); + + std::vector outputShapes; + for (uint32_t i = 0; i < outputCount; ++i) + { + outputShapes.push_back(outputDimensions); + } + return outputShapes; + } + + std::vector QLinearGlobalAveragePoolingHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const + { + auto inputShape = shapeInfo.GetInputTensorShape(0); + std::vector outputDimensions = InitializeKernelOutputDimensions(inputShape, m_kernel, m_kernel.channelsLast); + + const uint32_t outputCount = shapeInfo.GetOutputCount(); std::vector outputShapes; for (uint32_t i = 0; i < outputCount; ++i) diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 47e7573951803..8d7f0b5b043d0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -213,7 +213,9 @@ std::vector InitializeKernelOutputDimsTranspose( gsl::span inputDimensions, const KernelArgs& args); -KernelArgs InitializeGlobalKernel(gsl::span inputDimensions); +KernelArgs InitializeGlobalKernel( + const MLOperatorAttributes& kernelInfo, + gsl::span inputDimensions); KernelArgs InitializeKernel( const MLOperatorAttributes& kernelInfo, @@ -1068,7 +1070,7 @@ class PoolingHelperBase bool useGlobalPooling ) : m_kernel(useGlobalPooling - ? InitializeGlobalKernel(shape.GetInputTensorShape(0)) + ? 
InitializeGlobalKernel(info, shape.GetInputTensorShape(0)) : InitializeKernel(info, static_cast(shape.GetInputTensorShape(0).size()), gsl::span())) { if (!useGlobalPooling) @@ -1174,7 +1176,16 @@ class QLinearAveragePoolingHelper : public PoolingHelperBase { public: template - QLinearAveragePoolingHelper(const Info_t& info, const Shape_t& shape/*, bool useGlobalPooling */) : PoolingHelperBase(info, shape, false) {} + QLinearAveragePoolingHelper(const Info_t& info, const Shape_t& shape) : PoolingHelperBase(info, shape, false) {} + std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; + +}; + +class QLinearGlobalAveragePoolingHelper : public PoolingHelperBase +{ +public: + template + QLinearGlobalAveragePoolingHelper(const Info_t& info, const Shape_t& shape) : PoolingHelperBase(info, shape, true) {} std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; }; @@ -1509,6 +1520,7 @@ using ShapeInferenceHelper_LpPool = PoolingHelper; using ShapeInferenceHelper_GlobalLpPool = GlobalPoolingHelper; using ShapeInferenceHelper_MaxRoiPool = RoiPoolingHelper; using ShapeInferenceHelper_QLinearAveragePool = QLinearAveragePoolingHelper; +using ShapeInferenceHelper_QLinearGlobalAveragePool = QLinearGlobalAveragePoolingHelper; using ShapeInferenceHelper_RoiAlign10 = VersionedOpsetHelper; using ShapeInferenceHelper_RoiAlign16 = VersionedOpsetHelper; using ShapeInferenceHelper_InstanceNormalization = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index 5293d630f40f2..078f4a7aef6b0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -442,6 +442,7 @@ namespace OperatorHelper static const int sc_sinceVer_DynamicQuantizeMatMul = 1; static const int sc_sinceVer_QLinearConcat = 1; static const int sc_sinceVer_QLinearAveragePool = 1; + static const int sc_sinceVer_QLinearGlobalAveragePool = 1; } // namespace MsftOperatorSet1 } // namespace OperatorHelper diff --git a/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc index 8fb245819fd26..71b6f27b5391f 100644 --- a/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc @@ -66,6 +66,9 @@ void RunQLinearGlobalAveragePool( test.AddInput("y_scale", {}, {y_scale}); test.AddInput("y_zero_point", {}, {y_zero_point}); test.AddOutput("Y", y_dims, y_data); + if (channels_last) { + test.AddAttribute("channels_last", (int64_t)1LL); + } auto q8checker = [&](const std::vector& fetches, const std::string& provider_type) { const OrtValue& ort_value = fetches[0]; diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc index 78f7f431aa66e..dfe50b8486857 100644 --- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc @@ -646,5 +646,15 @@ TEST(QLinearPoolTest, AveragePool2D_Global_nhwc_S8) { 1); // count_include_pad } +TEST(QLinearPoolTest, AveragePool2D_Global_mock) { + RunQLinearAveragePoolNhwc( + {1, 1, 32, 32}, // x shape + {1, 1, 1, 1}, // expected y shape + {32, 32}, // kernel shape + {1, 1}, // strides + {0, 0, 0, 0}, // pads + 1); // count_include_pad +} + } // namespace test } // namespace onnxruntime From 
38ed2212b4133c344b28fd26491c193ac7214055 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Wed, 20 Sep 2023 02:44:20 -0700 Subject: [PATCH 3/9] Update Layout Conversion logic --- .../DmlOperatorQLinearAveragePooling.cpp | 143 ++++++------------ .../DmlExecutionProvider/src/TensorDesc.cpp | 49 ++++++ .../dml/DmlExecutionProvider/src/TensorDesc.h | 4 + 3 files changed, 98 insertions(+), 98 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp index 2d4b28b69126e..8433e90bb9b24 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -36,68 +36,9 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); uint32_t dmlDimSize = m_inputTensorDescs[OrtInputTensors::ortInput].GetDimensionCount(); - // Reshape the Input Scale to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. - m_inputTensorDescs[OrtInputTensors::ortInputScale] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortInputScale, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - - // Resize the Input ZeroPoint to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. - if (kernelInfo.IsInputValid(OrtInputTensors::ortInputZeroPoint)) - { - - m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortInputZeroPoint, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - } - - // Resize the Output Scale to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. - m_inputTensorDescs[OrtInputTensors::ortOutputScale] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortInputScale, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - - // Resize the Input ZeroPoint to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. 
- if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) - { - - m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortOutputZeroPoint, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - } - - // Initialize the output description while overriding the shape - m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); - - assert(m_kernel.spatialDimensionCount <= ARRAYSIZE(m_kernel.windowSize)); - - // DML requires that DimensionCount be equal to Input.DimCount - 2 for Pooling + ML_CHECK_VALID_ARGUMENT(dmlDimSize >= 2); + + // DML requires that DimensionCount be equal to Input.dmlDimSize - 2 for Pooling uint32_t expectedSpatialDimCount = m_inputTensorDescs[0].GetDimensionCount() - 2; if (m_kernel.spatialDimensionCount < expectedSpatialDimCount) { @@ -124,45 +65,51 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe m_kernel.spatialDimensionCount = expectedSpatialDimCount; } + // Initialize dimensionMapping for NCHW or NHWC layout + std::vector dimensionMapping = {0u, dmlDimSize - 1u}; + dimensionMapping.resize(dmlDimSize); if (isNhwc) { - uint32_t dimCount = m_inputTensorDescs[0].GetDimensionCount(); - const auto inputSizes = m_inputTensorDescs[OrtInputTensors::ortInput].GetSizes(); - std::vector nchwInputSizes; - std::vector nchwInputStrides; - ConvertNHWCToNCHW(dimCount, inputSizes, nchwInputSizes, nchwInputStrides); - m_inputTensorDescs[OrtInputTensors::ortInput] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInput].GetDmlDataType(), nchwInputSizes, nchwInputStrides); - - gsl::span inputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortInputScale].GetSizes(); - std::vector nchwInputScaleSizes; - std::vector nchwInputScaleStrides; - ConvertNHWCToNCHW(dimCount, inputScaleSizes, nchwInputScaleSizes, nchwInputScaleStrides); - m_inputTensorDescs[OrtInputTensors::ortInputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputScale].GetDmlDataType(), nchwInputScaleSizes, nchwInputScaleStrides); - - gsl::span inputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetSizes(); - std::vector nchwInputZeroPointSizes; - std::vector nchwInputZeroPointStrides; - ConvertNHWCToNCHW(dimCount, inputZeroPointSizes, nchwInputZeroPointSizes, nchwInputZeroPointStrides); - m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetDmlDataType(), nchwInputZeroPointSizes, nchwInputZeroPointStrides); - - gsl::span outputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetSizes(); - std::vector nchwOutputScaleSizes; - std::vector nchwOutputScaleStrides; - ConvertNHWCToNCHW(dimCount, outputScaleSizes, nchwOutputScaleSizes, nchwOutputScaleStrides); - m_inputTensorDescs[OrtInputTensors::ortOutputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetDmlDataType(), nchwOutputScaleSizes, nchwOutputScaleStrides); - - gsl::span outputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetSizes(); - std::vector nchwOutputZeroPointSizes; - std::vector nchwOutputZeroPointStrides; - ConvertNHWCToNCHW(dimCount, outputZeroPointSizes, nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); - m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = 
TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetDmlDataType(), nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); - - gsl::span outputSizes = m_outputTensorDescs[0].GetSizes(); - std::vector nchwOutputSizes; - std::vector nchwOutputStrides; - ConvertNHWCToNCHW(dimCount, outputSizes, nchwOutputSizes, nchwOutputStrides); - m_outputTensorDescs[0] = TensorDesc(m_outputTensorDescs[0].GetDmlDataType(), nchwOutputSizes, nchwOutputStrides); + // Form a remapping for dimensions so C is moved before the spatial dimensions. + // e.g. NWC -> {0,2,1} -> NCW + // NHWC -> {0,3,1,2} -> NCHW + // NDHWC -> {0,4,1,2,3} -> NCDHW + std::iota(dimensionMapping.begin() + 2, dimensionMapping.end(), 1u); + } + else + { + // Use NCHW {0,1,2,3} format with increasing order of indexs + std::iota(dimensionMapping.begin() + 1, dimensionMapping.end(), 1u); } + m_inputTensorDescs[OrtInputTensors::ortInput].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + // Reshape the Input Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortInputScale].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + // Reshape the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + if (kernelInfo.IsInputValid(OrtInputTensors::ortInputZeroPoint)) + { + m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + } + + // Reshape the Output Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortOutputScale].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + + // Reshape the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. 
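The PermuteDimensions calls above rely on an NHWC buffer being describable as an NCHW tensor purely by permuting sizes and strides, with no data movement; folding that into TensorDesc lets every input, scale, zero-point, and output tensor reuse the same mapping instead of the earlier per-tensor ConvertNHWCToNCHW conversions. A minimal sketch of that permutation for a left-aligned 4-D tensor, with illustrative sizes (the arrays and main function below are not part of the patch):

    #include <array>
    #include <cstdint>

    int main()
    {
        std::array<uint32_t, 4> sizes   = {1, 4, 4, 3};    // NHWC sizes
        std::array<uint32_t, 4> strides = {48, 12, 3, 1};  // packed strides, computed right-to-left
        std::array<uint32_t, 4> mapping = {0, 3, 1, 2};    // NHWC -> NCHW, as in the comment above

        std::array<uint32_t, 4> newSizes{};
        std::array<uint32_t, 4> newStrides{};
        for (size_t i = 0; i < mapping.size(); ++i)
        {
            newSizes[i]   = sizes[mapping[i]];    // reorder sizes into NCHW
            newStrides[i] = strides[mapping[i]];  // carry the matching strides along
        }
        // newSizes   == {1, 3, 4, 4}   -- NCHW view of the same buffer
        // newStrides == {48, 1, 12, 3} -- channel stride 1, so no copy is required
    }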
+ if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) + { + m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + } + + // Initialize the output description while overriding the shape + m_outputTensorDescs[0].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + assert(m_kernel.spatialDimensionCount <= ARRAYSIZE(m_kernel.windowSize)); std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 067a320dd8000..36156e98dd311 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -315,3 +315,52 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm } m_bufferTensorDesc.DimensionCount = newDimensionCount; } + +// Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout +void TensorDesc::PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment) +{ + EnsureMinimumDimensionCount(dimensionMapping.size(), alignment); + InitializeStrides(static_cast(dimensionMapping.size()), alignment); + PermuteArray(dimensionMapping, alignment); +} + +// Shuffle m_sizes and m_strides acording to the indexes pointed by dimensionMapping +void TensorDesc::PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment) +{ + std::vector temp_sizes(dimensionMapping.size()); + std::vector temp_strides(dimensionMapping.size()); + // Right alignment values are shifted to the end + int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - static_cast(dimensionMapping.size()) : 0; + + for (size_t i = 0; i < dimensionMapping.size(); i++) { + temp_sizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; + temp_strides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; + } + + std::copy(temp_sizes.begin(), temp_sizes.end(), m_sizes + alignmentOffset); + std::copy(temp_strides.begin(), temp_strides.end(), m_strides + alignmentOffset); + m_bufferTensorDesc.Sizes = m_sizes; + m_bufferTensorDesc.Strides = m_strides; +} + +void TensorDesc::InitializeStrides(int dimensionCount, const TensorAxis alignment) +{ + ML_CHECK_VALID_ARGUMENT(alignment == TensorAxis::RightAligned || alignment == TensorAxis::LeftAligned); + // Right alignment values are shifted to the end + int alignmentOffset = (alignment == TensorAxis::RightAligned) ? 
MaximumDimensionCount - dimensionCount : 0; + int index = dimensionCount + alignmentOffset; + uint32_t stride = 1; + m_strides[index - 1] = 1; + for (int i = index - 2; i >= 0; i--) + { + stride *= m_sizes[i + 1]; + m_strides[i] = stride; + } +} + +void TensorDesc::EnsureMinimumDimensionCount(const size_t dimensionCount, const TensorAxis alignment) +{ + // m_sizes and m_strides are arrays of size MaximumDimensionCount + assert(MaximumDimensionCount >= dimensionCount); + SetDimensionCount(static_cast(dimensionCount), alignment); +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index ff70dec5b8871..041fee90284e5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -44,6 +44,10 @@ namespace Dml gsl::span GetSizes() const { return { m_sizes, m_sizes + m_bufferTensorDesc.DimensionCount }; } gsl::span GetStrides() const; void SetStrides(gsl::span strides); + void PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment); + void PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment); + void InitializeStrides(int count, const TensorAxis alignment); + void EnsureMinimumDimensionCount(const size_t count, const TensorAxis alignment); inline uint64_t GetBufferSizeInBytes() const { From f1905765ec8d072359a685572fe5a16294640788 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 00:08:36 -0700 Subject: [PATCH 4/9] Address Reviews --- .../src/Operators/DmlOperator.cpp | 34 -------------- .../src/Operators/DmlOperator.h | 7 --- .../DmlOperatorQLinearAveragePooling.cpp | 1 - .../DmlExecutionProvider/src/TensorDesc.cpp | 46 +++++++++++-------- .../dml/DmlExecutionProvider/src/TensorDesc.h | 8 ++-- .../test/contrib_ops/qlinear_pool_test.cc | 10 ---- 6 files changed, 31 insertions(+), 75 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 5c2b75d197352..25c7be42d6425 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -635,40 +635,6 @@ namespace Dml )); } - void DmlOperator::ConvertNHWCToNCHW( - const uint32_t dimCount, - const gsl::span nhwcSizes, - std::vector& nchwSizes, - std::vector& nchwInputStrides) - { - int i = 0; - const uint32_t inputBatch = nhwcSizes[i++]; - const uint32_t inputDepth = dimCount == 5 ? 
nhwcSizes[i++] : 0; - const uint32_t inputHeight = nhwcSizes[i++]; - const uint32_t inputWidth = nhwcSizes[i++]; - const uint32_t inputChannels = nhwcSizes[i++]; - - if (dimCount == 4) - { - nchwSizes = { inputBatch, inputChannels, inputHeight, inputWidth }; - nchwInputStrides = { inputHeight * inputWidth * inputChannels, - 1, - inputWidth * inputChannels, - inputChannels - }; - } - else - { - nchwSizes = { inputBatch, inputChannels, inputDepth, inputHeight, inputWidth }; - nchwInputStrides = { inputDepth * inputChannels * inputWidth * inputHeight, - 1, - inputChannels * inputWidth * inputHeight, - inputChannels * inputWidth, - inputChannels - }; - } - } - TensorDesc DmlOperator::CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index cecb943c382cf..df123f8db4658 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -118,13 +118,6 @@ namespace Dml // ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); - - void ConvertNHWCToNCHW( - const uint32_t dimCount, - const gsl::span nhwcSizes, - std::vector& nchwSizes, - std::vector& nchwInputStrides - ); TensorDesc CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp index 8433e90bb9b24..0fccedfe311c1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -98,7 +98,6 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe // The 1D tensor needs to be moved to the H channel. m_inputTensorDescs[OrtInputTensors::ortOutputScale].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); - // Reshape the Input ZeroPoint to be the same dimension as the input tensor. // The 1D tensor needs to be moved to the H channel. 
if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 36156e98dd311..08ff04e0d5b57 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -317,38 +317,45 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm } // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout -void TensorDesc::PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment) +void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureMinimumDimensionCount(dimensionMapping.size(), alignment); - InitializeStrides(static_cast(dimensionMapping.size()), alignment); + EnsureMinimumDimensionCount(static_cast(dimensionMapping.size()), alignment); + EnsureStridesExist(static_cast(dimensionMapping.size()), alignment); PermuteArray(dimensionMapping, alignment); } -// Shuffle m_sizes and m_strides acording to the indexes pointed by dimensionMapping -void TensorDesc::PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment) +// Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping +void TensorDesc::PermuteArray(gsl::span dimensionMapping, TensorAxis alignment) { - std::vector temp_sizes(dimensionMapping.size()); - std::vector temp_strides(dimensionMapping.size()); + std::vector tempSizes(dimensionMapping.size()); + std::vector tempStrides(dimensionMapping.size()); // Right alignment values are shifted to the end int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - static_cast(dimensionMapping.size()) : 0; - for (size_t i = 0; i < dimensionMapping.size(); i++) { - temp_sizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; - temp_strides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; + for (size_t i = 0; i < dimensionMapping.size(); i++) + { + tempSizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; + tempStrides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; } - std::copy(temp_sizes.begin(), temp_sizes.end(), m_sizes + alignmentOffset); - std::copy(temp_strides.begin(), temp_strides.end(), m_strides + alignmentOffset); + std::copy(tempSizes.begin(), tempSizes.end(), m_sizes + alignmentOffset); + std::copy(tempStrides.begin(), tempStrides.end(), m_strides + alignmentOffset); m_bufferTensorDesc.Sizes = m_sizes; m_bufferTensorDesc.Strides = m_strides; } -void TensorDesc::InitializeStrides(int dimensionCount, const TensorAxis alignment) +void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignment) { + if (m_bufferTensorDesc.Strides != nullptr) + { + // Strides are populated + return; + } + ML_CHECK_VALID_ARGUMENT(alignment == TensorAxis::RightAligned || alignment == TensorAxis::LeftAligned); // Right alignment values are shifted to the end - int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - dimensionCount : 0; - int index = dimensionCount + alignmentOffset; + int alignmentOffset = (alignment == TensorAxis::RightAligned) ? 
MaximumDimensionCount - static_cast(dimensionCount) : 0; + int index = static_cast(dimensionCount) + alignmentOffset; uint32_t stride = 1; m_strides[index - 1] = 1; for (int i = index - 2; i >= 0; i--) @@ -358,9 +365,10 @@ void TensorDesc::InitializeStrides(int dimensionCount, const TensorAxis alignmen } } -void TensorDesc::EnsureMinimumDimensionCount(const size_t dimensionCount, const TensorAxis alignment) +void TensorDesc::EnsureMinimumDimensionCount(uint32_t dimensionCount, TensorAxis alignment) { - // m_sizes and m_strides are arrays of size MaximumDimensionCount - assert(MaximumDimensionCount >= dimensionCount); - SetDimensionCount(static_cast(dimensionCount), alignment); + if(dimensionCount != m_bufferTensorDesc.DimensionCount) + { + SetDimensionCount(dimensionCount, alignment); + } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 041fee90284e5..fbf28f2b425c4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -44,10 +44,8 @@ namespace Dml gsl::span GetSizes() const { return { m_sizes, m_sizes + m_bufferTensorDesc.DimensionCount }; } gsl::span GetStrides() const; void SetStrides(gsl::span strides); - void PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment); - void PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment); - void InitializeStrides(int count, const TensorAxis alignment); - void EnsureMinimumDimensionCount(const size_t count, const TensorAxis alignment); + void PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment); + void EnsureMinimumDimensionCount(uint32_t count, TensorAxis alignment); inline uint64_t GetBufferSizeInBytes() const { @@ -94,6 +92,8 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; + void PermuteArray(gsl::span dimensionMapping, TensorAxis alignment); + void EnsureStridesExist(uint32_t count, TensorAxis alignment); }; class TensorDescBuilder diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc index dfe50b8486857..78f7f431aa66e 100644 --- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc @@ -646,15 +646,5 @@ TEST(QLinearPoolTest, AveragePool2D_Global_nhwc_S8) { 1); // count_include_pad } -TEST(QLinearPoolTest, AveragePool2D_Global_mock) { - RunQLinearAveragePoolNhwc( - {1, 1, 32, 32}, // x shape - {1, 1, 1, 1}, // expected y shape - {32, 32}, // kernel shape - {1, 1}, // strides - {0, 0, 0, 0}, // pads - 1); // count_include_pad -} - } // namespace test } // namespace onnxruntime From cb20fa443592fd0344c88934572df23d4790524e Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 15:14:55 -0700 Subject: [PATCH 5/9] Address review 2 --- .../src/Operators/DmlOperator.h | 2 +- .../DmlExecutionProvider/src/TensorDesc.cpp | 33 ++++++------------- .../dml/DmlExecutionProvider/src/TensorDesc.h | 5 ++- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index df123f8db4658..c1e8cf42a974c 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -118,7 +118,7 @@ namespace Dml // ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); - + TensorDesc CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 08ff04e0d5b57..653836f72bcae 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -319,32 +319,30 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureMinimumDimensionCount(static_cast(dimensionMapping.size()), alignment); - EnsureStridesExist(static_cast(dimensionMapping.size()), alignment); - PermuteArray(dimensionMapping, alignment); + EnsureStridesExist(static_cast(dimensionMapping.size())); + SetDimensionCount(static_cast(dimensionMapping.size()), alignment); + PermuteArray(dimensionMapping); } // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping -void TensorDesc::PermuteArray(gsl::span dimensionMapping, TensorAxis alignment) +void TensorDesc::PermuteArray(gsl::span dimensionMapping) { std::vector tempSizes(dimensionMapping.size()); std::vector tempStrides(dimensionMapping.size()); - // Right alignment values are shifted to the end - int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - static_cast(dimensionMapping.size()) : 0; for (size_t i = 0; i < dimensionMapping.size(); i++) { - tempSizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; - tempStrides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; + tempSizes[i] = m_sizes[dimensionMapping[i]]; + tempStrides[i] = m_strides[dimensionMapping[i]]; } - std::copy(tempSizes.begin(), tempSizes.end(), m_sizes + alignmentOffset); - std::copy(tempStrides.begin(), tempStrides.end(), m_strides + alignmentOffset); + std::copy(tempSizes.begin(), tempSizes.end(), m_sizes); + std::copy(tempStrides.begin(), tempStrides.end(), m_strides); m_bufferTensorDesc.Sizes = m_sizes; m_bufferTensorDesc.Strides = m_strides; } -void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignment) +void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) { if (m_bufferTensorDesc.Strides != nullptr) { @@ -352,10 +350,7 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignmen return; } - ML_CHECK_VALID_ARGUMENT(alignment == TensorAxis::RightAligned || alignment == TensorAxis::LeftAligned); - // Right alignment values are shifted to the end - int alignmentOffset = (alignment == TensorAxis::RightAligned) ? 
MaximumDimensionCount - static_cast(dimensionCount) : 0; - int index = static_cast(dimensionCount) + alignmentOffset; + int index = static_cast(dimensionCount); uint32_t stride = 1; m_strides[index - 1] = 1; for (int i = index - 2; i >= 0; i--) @@ -364,11 +359,3 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignmen m_strides[i] = stride; } } - -void TensorDesc::EnsureMinimumDimensionCount(uint32_t dimensionCount, TensorAxis alignment) -{ - if(dimensionCount != m_bufferTensorDesc.DimensionCount) - { - SetDimensionCount(dimensionCount, alignment); - } -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index fbf28f2b425c4..2a7b0f3714b5d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -45,7 +45,6 @@ namespace Dml gsl::span GetStrides() const; void SetStrides(gsl::span strides); void PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment); - void EnsureMinimumDimensionCount(uint32_t count, TensorAxis alignment); inline uint64_t GetBufferSizeInBytes() const { @@ -92,8 +91,8 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; - void PermuteArray(gsl::span dimensionMapping, TensorAxis alignment); - void EnsureStridesExist(uint32_t count, TensorAxis alignment); + void PermuteArray(gsl::span dimensionMapping); + void EnsureStridesExist(uint32_t count); }; class TensorDescBuilder From 24f1392091feab2f887fb61996c59e5e3043978e Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 16:04:49 -0700 Subject: [PATCH 6/9] Clean up --- .../dml/DmlExecutionProvider/src/TensorDesc.cpp | 10 +++------- .../dml/DmlExecutionProvider/src/TensorDesc.h | 1 - 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 653836f72bcae..c685ec460dfe0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -321,14 +321,10 @@ void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, c { EnsureStridesExist(static_cast(dimensionMapping.size())); SetDimensionCount(static_cast(dimensionMapping.size()), alignment); - PermuteArray(dimensionMapping); -} -// Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping -void TensorDesc::PermuteArray(gsl::span dimensionMapping) -{ - std::vector tempSizes(dimensionMapping.size()); - std::vector tempStrides(dimensionMapping.size()); + // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping + std::vector tempSizes{m_sizes, m_sizes + MaximumDimensionCount}; + std::vector tempStrides{m_strides, m_strides + MaximumDimensionCount}; for (size_t i = 0; i < dimensionMapping.size(); i++) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 2a7b0f3714b5d..5925805025cc4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -91,7 +91,6 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; 
uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; - void PermuteArray(gsl::span dimensionMapping); void EnsureStridesExist(uint32_t count); }; From 2c4b8f91e8a0e53c98010c833631ea95995538ec Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 16:57:31 -0700 Subject: [PATCH 7/9] remove some redundancies --- .../providers/dml/DmlExecutionProvider/src/TensorDesc.cpp | 6 ++---- .../providers/dml/DmlExecutionProvider/src/TensorDesc.h | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index c685ec460dfe0..92d7e500afe15 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -328,12 +328,10 @@ void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, c for (size_t i = 0; i < dimensionMapping.size(); i++) { - tempSizes[i] = m_sizes[dimensionMapping[i]]; - tempStrides[i] = m_strides[dimensionMapping[i]]; + m_sizes[i] = tempSizes[dimensionMapping[i]]; + m_strides[i] = tempStrides[dimensionMapping[i]]; } - std::copy(tempSizes.begin(), tempSizes.end(), m_sizes); - std::copy(tempStrides.begin(), tempStrides.end(), m_strides); m_bufferTensorDesc.Sizes = m_sizes; m_bufferTensorDesc.Strides = m_strides; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 5925805025cc4..57015e3fb58d1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -91,6 +91,7 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; + void EnsureStridesExist(uint32_t count); }; From 78ebdeadd6c88d4158f31c126ea7c80bad877b49 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 18:25:11 -0700 Subject: [PATCH 8/9] Update Stride Calculation --- .../providers/dml/DmlExecutionProvider/src/TensorDesc.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 92d7e500afe15..85d54da120425 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -319,7 +319,7 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureStridesExist(static_cast(dimensionMapping.size())); + EnsureStridesExist(m_bufferTensorDesc.DimensionCount); SetDimensionCount(static_cast(dimensionMapping.size()), alignment); // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping @@ -346,10 +346,9 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) int index = static_cast(dimensionCount); uint32_t stride = 1; - m_strides[index - 1] = 1; - for (int i = index - 2; i >= 0; i--) + for (int i = index; i-- > 0;) { - stride *= m_sizes[i + 1]; m_strides[i] = stride; + stride *= m_sizes[i]; } } From 
534985647b95d669550f684ab4fc0f2a923893f9 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 19:03:53 -0700 Subject: [PATCH 9/9] clean up 2 --- .../providers/dml/DmlExecutionProvider/src/TensorDesc.cpp | 7 +++---- .../providers/dml/DmlExecutionProvider/src/TensorDesc.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 85d54da120425..a2183aab52eed 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -319,7 +319,7 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureStridesExist(m_bufferTensorDesc.DimensionCount); + EnsureStridesExist(); SetDimensionCount(static_cast(dimensionMapping.size()), alignment); // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping @@ -336,7 +336,7 @@ void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, c m_bufferTensorDesc.Strides = m_strides; } -void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) +void TensorDesc::EnsureStridesExist() { if (m_bufferTensorDesc.Strides != nullptr) { @@ -344,9 +344,8 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) return; } - int index = static_cast(dimensionCount); uint32_t stride = 1; - for (int i = index; i-- > 0;) + for (uint32_t i = m_bufferTensorDesc.DimensionCount; i-- > 0;) { m_strides[i] = stride; stride *= m_sizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 57015e3fb58d1..909e2084d0163 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -92,7 +92,7 @@ namespace Dml uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; - void EnsureStridesExist(uint32_t count); + void EnsureStridesExist(); }; class TensorDescBuilder
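The patch series above removes the hand-rolled NHWC-to-NCHW size/stride computation from DmlOperator and instead routes layout changes through TensorDesc::PermuteDimensions with lazily derived packed strides. The standalone C++ sketch below is illustrative only and is not part of the patch series: it mirrors the end state of patches 8/9 and 9/9 using a simplified, hypothetical MiniTensorDesc type (MiniTensorDesc and the main driver are invented for the example and do not exist in the ONNX Runtime sources). It derives packed strides by walking the sizes backwards, then re-views the tensor through a dimension mapping, so an NHWC tensor can be treated as NCHW without copying data.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical, simplified stand-in for Dml::TensorDesc, for illustration only.
    struct MiniTensorDesc
    {
        std::vector<uint32_t> sizes;
        std::vector<uint32_t> strides; // empty => packed layout, strides not set yet

        // Same idea as TensorDesc::EnsureStridesExist() after patch 9/9: if no
        // strides were supplied, derive packed (row-major) strides from the sizes.
        void EnsureStridesExist()
        {
            if (!strides.empty())
            {
                return; // strides are already populated
            }
            strides.resize(sizes.size());
            uint32_t stride = 1;
            for (uint32_t i = static_cast<uint32_t>(sizes.size()); i-- > 0;)
            {
                strides[i] = stride;
                stride *= sizes[i];
            }
        }

        // Same idea as TensorDesc::PermuteDimensions(): reorder sizes and strides
        // according to dimensionMapping. Only the logical view changes; no data moves.
        void PermuteDimensions(const std::vector<uint32_t>& dimensionMapping)
        {
            assert(dimensionMapping.size() == sizes.size());
            EnsureStridesExist();
            const std::vector<uint32_t> tempSizes = sizes;
            const std::vector<uint32_t> tempStrides = strides;
            for (size_t i = 0; i < dimensionMapping.size(); ++i)
            {
                sizes[i] = tempSizes[dimensionMapping[i]];
                strides[i] = tempStrides[dimensionMapping[i]];
            }
        }
    };

    int main()
    {
        // NHWC shape {1, 32, 32, 3}; the mapping {0, 3, 1, 2} re-views it as NCHW
        // {1, 3, 32, 32} with strides {3072, 1, 96, 3}, matching what the removed
        // ConvertNHWCToNCHW helper used to compute by hand.
        MiniTensorDesc desc{ {1, 32, 32, 3}, {} };
        desc.PermuteDimensions({0, 3, 1, 2});
        for (size_t i = 0; i < desc.sizes.size(); ++i)
        {
            std::printf("dim %zu: size=%u stride=%u\n", i, desc.sizes[i], desc.strides[i]);
        }
        return 0;
    }

Expressing the layout change as a stride permutation is what lets QLinearAveragePooling, and any future NHWC-aware operator, share one code path in TensorDesc instead of each carrying a layout-specific helper like the removed ConvertNHWCToNCHW.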