From 5603450f1e65750b7d61a5c741f80f9308bb4cde Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 31 Aug 2023 17:52:10 -0700 Subject: [PATCH 1/9] Enable QLinearAveragePooling DML EP --- .../src/External/DirectMLHelpers/ApiTraits.h | 37 +++- .../External/DirectMLHelpers/DirectMLSchema.h | 25 +++ .../DirectMLHelpers/GeneratedSchemaHelpers.h | 22 ++ .../src/Operators/DmlOperator.cpp | 34 +++ .../src/Operators/DmlOperator.h | 7 + .../DmlOperatorQLinearAveragePooling.cpp | 208 ++++++++++++++++++ .../src/Operators/OperatorRegistration.cpp | 6 + .../dml/OperatorAuthorHelper/Attributes.h | 1 + .../OperatorAuthorHelper/OperatorHelper.cpp | 18 ++ .../dml/OperatorAuthorHelper/OperatorHelper.h | 12 + .../OperatorAuthorHelper/OperatorVersions.h | 1 + 11 files changed, 369 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index f6d71ce629a8d..570a0f82b62ff 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -14,7 +14,26 @@ struct DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC _Maybenull_ const DML_TENSOR_DESC* BiasTensor; const DML_TENSOR_DESC* OutputTensor; }; -const int DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT = 0x80000011; +const int DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT = 0x80000011; + +struct DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC +{ + const DML_TENSOR_DESC* InputTensor; + const DML_TENSOR_DESC* InputScaleTensor; + _Maybenull_ const DML_TENSOR_DESC* InputZeroPointTensor; + const DML_TENSOR_DESC* OutputScaleTensor; + _Maybenull_ const DML_TENSOR_DESC* OutputZeroPointTensor; + const DML_TENSOR_DESC* OutputTensor; + UINT DimensionCount; + _Field_size_(DimensionCount) const UINT* Strides; + _Field_size_(DimensionCount) const UINT* WindowSize; + _Field_size_(DimensionCount) const UINT* StartPadding; + _Field_size_(DimensionCount) const UINT* EndPadding; + _Field_size_(DimensionCount) const UINT* Dilations; + BOOL IncludePadding; +}; +const int DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING = 0x8000000B; + namespace ApiTraits { @@ -38,7 +57,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 161; + static constexpr auto ValueCount = 162; static constexpr size_t ActivationFunctionCount = 24; }; @@ -497,6 +516,12 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ROI_POOLING; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; +}; + template <> struct OperatorDescTraits { @@ -1492,6 +1517,12 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ROI_POOLING> using DescType = DML_ROI_POOLING_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING> +{ + using DescType = DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_SLICE> { @@ -2524,6 +2555,8 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... 
args #pragma warning(disable: 4063) case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); #pragma warning(pop) default: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index f3a3aec50e4b4..2e9217cf3f4f7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -829,6 +829,31 @@ constexpr DML_OPERATOR_SCHEMA DML_ROI_POOLING_OPERATOR_SCHEMA { DML_ROI_POOLING_OPERATOR_SCHEMA_FIELDS, }; + +constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS[13] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSize", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "IncludePadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA { + "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", + static_cast(DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING), + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 13, + DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_SLICE_OPERATOR_SCHEMA_FIELDS[6] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 90915c7e757de..1b82295ea4f9e 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -473,6 +473,24 @@ inline std::vector GetFields(const DML_ROI_POOLING_OPERATOR_DESC& OperatorField(&DML_ROI_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.PooledSize))), }; } +inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.InputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.OutputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.OutputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.WindowSize), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[9], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[10], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[11], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), + }; +} inline std::vector GetFields(const DML_SLICE_OPERATOR_DESC& desc) { return { @@ -2492,6 +2510,10 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return AbstractOperatorDesc( + &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); #pragma warning(pop) default: diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 25c7be42d6425..78f7d1aef33bd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -635,6 +635,40 @@ namespace Dml )); } + void DmlOperator::ConvertNHWCToNCHW( + const uint32_t& dimCount, + const gsl::span nhwcSizes, + std::vector& nchwSizes, + std::vector& nchwInputStrides) + { + int i = 0; + const 
uint32_t inputBatch = nhwcSizes[ i++ ]; + const uint32_t inputDepth = dimCount == 5 ? nhwcSizes[ i++ ] : 0; + const uint32_t inputHeight = nhwcSizes[ i++ ]; + const uint32_t inputWidth = nhwcSizes[ i++ ]; + const uint32_t inputChannels = nhwcSizes[ i++ ]; + + if (dimCount == 4) + { + nchwSizes = { inputBatch, inputChannels, inputHeight, inputWidth }; + nchwInputStrides = { inputHeight * inputWidth * inputChannels, + 1, + inputWidth * inputChannels, + inputChannels + }; + } + else + { + nchwSizes = { inputBatch, inputChannels, inputDepth, inputHeight, inputWidth }; + nchwInputStrides = { inputDepth * inputChannels * inputWidth * inputHeight, + 1, + inputChannels * inputWidth * inputHeight, + inputChannels * inputWidth, + inputChannels + }; + } + } + TensorDesc DmlOperator::CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index c1e8cf42a974c..394195ce14a6d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -119,6 +119,13 @@ namespace Dml ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); + void DmlOperator::ConvertNHWCToNCHW( + const uint32_t& dimCount, + const gsl::span nhwcSizes, + std::vector& nchwSizes, + std::vector& nchwInputStrides + ); + TensorDesc CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp new file mode 100644 index 0000000000000..4303149eae347 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -0,0 +1,208 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
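For orientation, the operator implemented in this new file follows the usual QLinear pattern: dequantize the 8-bit input with x_scale/x_zero_point, run an ordinary average pool, then requantize with y_scale/y_zero_point. A minimal scalar sketch of that math, assuming the uint8 case, per-tensor quantization parameters, and ignoring count_include_pad; the helper name and the std::vector window are illustrative, not part of the patch (DirectML is expected to fuse all of this into the single DML_QUANTIZED_LINEAR_AVERAGE_POOLING operator):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Reference-only helper: averages one pooling window in dequantized space,
    // then requantizes the mean with the output scale/zero point.
    inline uint8_t QLinearAverageReference(
        const std::vector<uint8_t>& window,
        float xScale, int32_t xZeroPoint,
        float yScale, int32_t yZeroPoint)
    {
        float sum = 0.0f;
        for (uint8_t q : window)
        {
            sum += (static_cast<int32_t>(q) - xZeroPoint) * xScale;  // dequantize
        }
        const float mean = sum / static_cast<float>(window.size());  // average pool
        const int32_t y = static_cast<int32_t>(std::nearbyint(mean / yScale)) + yZeroPoint;  // requantize
        return static_cast<uint8_t>(std::clamp(y, 0, 255));  // saturate to the uint8 range
    }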
+ +#include "precomp.h" + +namespace Dml +{ + +class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelperBase +{ + // For QLinear Avg Pool ORT and DML have same indexing order + enum OrtInputTensors : uint32_t + { + ortInput, + ortInputScale, + ortInputZeroPoint, + ortOutputScale, + ortOutputZeroPoint, + ortInputCount + }; + +public: + using Self = DmlOperatorQLinearAveragePooling; + + DmlOperatorQLinearAveragePooling( + const MLOperatorKernelCreationContext& kernelInfo, + bool useGlobalPooling + ) + : DmlOperator(kernelInfo), + PoolingHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), useGlobalPooling) + { + DmlOperator::Initialize(kernelInfo); + + bool isNhwc = m_kernel.channelsLast; + std::vector inputShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortInput); + std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); + + // Initialize the input descriptions with broadcasting + m_inputTensorDescs[OrtInputTensors::ortInput] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortInput, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape); + + uint32_t dmlDimSize = m_inputTensorDescs[OrtInputTensors::ortInput].GetDimensionCount(); + // Resize the Input Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortInputScale] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortInputScale, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + + // Resize the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + if (kernelInfo.IsInputValid(OrtInputTensors::ortInputZeroPoint)) + { + + m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortInputZeroPoint, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + } + + // Resize the Output Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortOutputScale] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortInputScale, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + + // Resize the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. 
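A worked example of the spatial-dimension shift performed a little further down in this constructor (the numbers are illustrative, not taken from the patch): a 1-D pool with kernel_shape {3}, strides {2}, and pads {1, 1} that arrives with a 4-D DML input description has expectedSpatialDimCount == 2, so each kernel array is shifted right by one slot and left-filled with identity values:

    windowSize   {3}  ->  {1, 3}
    strides      {2}  ->  {1, 2}
    startPadding {1}  ->  {0, 1}
    endPadding   {1}  ->  {0, 1}
    dilations    {1}  ->  {1, 1}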
+ if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) + { + + m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = CreateTensorDescFromInput( + kernelInfo, + OrtInputTensors::ortOutputZeroPoint, + TensorAxis::DoNotCoerce, + TensorAxis::H, + TensorAxis::LeftAligned, + std::nullopt, + dmlDimSize + ); + } + + // Initialize the output description while overriding the shape + m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); + + assert(m_kernel.spatialDimensionCount <= ARRAYSIZE(m_kernel.windowSize)); + + // DML requires that DimensionCount be equal to Input.DimCount - 2 for Pooling + uint32_t expectedSpatialDimCount = m_inputTensorDescs[0].GetDimensionCount() - 2; + if (m_kernel.spatialDimensionCount < expectedSpatialDimCount) + { + size_t shift = expectedSpatialDimCount - m_kernel.spatialDimensionCount; + + for (int i = gsl::narrow_cast(m_kernel.spatialDimensionCount) - 1; i >= 0; i--) + { + m_kernel.windowSize[i + shift] = m_kernel.windowSize[i]; + m_kernel.windowSize[i] = 1; + + m_kernel.strides[i + shift] = m_kernel.strides[i]; + m_kernel.strides[i] = 1; + + m_kernel.startPadding[i + shift] = m_kernel.startPadding[i]; + m_kernel.startPadding[i] = 0; + + m_kernel.endPadding[i + shift] = m_kernel.endPadding[i]; + m_kernel.endPadding[i] = 0; + + m_kernel.dilations[i + shift] = m_kernel.dilations[i]; + m_kernel.dilations[i] = 1; + } + + m_kernel.spatialDimensionCount = expectedSpatialDimCount; + } + + if (isNhwc) + { + uint32_t dimCount = m_inputTensorDescs[0].GetDimensionCount(); + const auto inputSizes = m_inputTensorDescs[OrtInputTensors::ortInput].GetSizes(); + std::vector nchwInputSizes; + std::vector nchwInputStrides; + ConvertNHWCToNCHW(dimCount, inputSizes, nchwInputSizes, nchwInputStrides); + m_inputTensorDescs[OrtInputTensors::ortInput] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInput].GetDmlDataType(), nchwInputSizes, nchwInputStrides); + + gsl::span inputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortInputScale].GetSizes(); + std::vector nchwInputScaleSizes; + std::vector nchwInputScaleStrides; + ConvertNHWCToNCHW(dimCount, inputScaleSizes, nchwInputScaleSizes, nchwInputScaleStrides); + m_inputTensorDescs[OrtInputTensors::ortInputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputScale].GetDmlDataType(), nchwInputScaleSizes, nchwInputScaleStrides); + + gsl::span inputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetSizes(); + std::vector nchwInputZeroPointSizes; + std::vector nchwInputZeroPointStrides; + ConvertNHWCToNCHW(dimCount, inputZeroPointSizes, nchwInputZeroPointSizes, nchwInputZeroPointStrides); + m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetDmlDataType(), nchwInputZeroPointSizes, nchwInputZeroPointStrides); + + gsl::span outputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetSizes(); + std::vector nchwOutputScaleSizes; + std::vector nchwOutputScaleStrides; + ConvertNHWCToNCHW(dimCount, outputScaleSizes, nchwOutputScaleSizes, nchwOutputScaleStrides); + m_inputTensorDescs[OrtInputTensors::ortOutputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetDmlDataType(), nchwOutputScaleSizes, nchwOutputScaleStrides); + + gsl::span outputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetSizes(); + std::vector nchwOutputZeroPointSizes; + std::vector 
nchwOutputZeroPointStrides; + ConvertNHWCToNCHW(dimCount, outputZeroPointSizes, nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); + m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetDmlDataType(), nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); + + gsl::span outputSizes = m_outputTensorDescs[0].GetSizes(); + std::vector nchwOutputSizes; + std::vector nchwOutputStrides; + ConvertNHWCToNCHW(dimCount, outputSizes, nchwOutputSizes, nchwOutputStrides); + m_outputTensorDescs[0] = TensorDesc(m_outputTensorDescs[0].GetDmlDataType(), nchwOutputSizes, nchwOutputStrides); + } + + std::vector inputDescs = GetDmlInputDescs(); + std::vector outputDescs = GetDmlOutputDescs(); + + DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC qLinearAvgPooldesc = {}; + + qLinearAvgPooldesc.InputTensor = &inputDescs[OrtInputTensors::ortInput]; + qLinearAvgPooldesc.InputScaleTensor = &inputDescs[OrtInputTensors::ortInputScale]; + qLinearAvgPooldesc.InputZeroPointTensor = &inputDescs[OrtInputTensors::ortInputZeroPoint]; + qLinearAvgPooldesc.OutputScaleTensor = &inputDescs[OrtInputTensors::ortOutputScale];; + qLinearAvgPooldesc.OutputZeroPointTensor = &inputDescs[OrtInputTensors::ortOutputZeroPoint];; + qLinearAvgPooldesc.OutputTensor = &outputDescs[0]; + qLinearAvgPooldesc.DimensionCount = m_kernel.spatialDimensionCount; + qLinearAvgPooldesc.WindowSize = m_kernel.windowSize; + qLinearAvgPooldesc.Strides = m_kernel.strides; + qLinearAvgPooldesc.StartPadding = m_kernel.startPadding; + qLinearAvgPooldesc.EndPadding = m_kernel.endPadding; + qLinearAvgPooldesc.Dilations = m_kernel.dilations; + qLinearAvgPooldesc.IncludePadding = kernelInfo.GetOptionalAttribute(AttrName::CountIncludePad, false); + + DML_OPERATOR_DESC opDesc = { (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING, &qLinearAvgPooldesc }; + SetDmlOperatorDesc(opDesc, kernelInfo); + } +}; + +template +class DmlOperatorQuantizedPoolingTemplate : public DmlOperatorQLinearAveragePooling +{ +public: + DmlOperatorQuantizedPoolingTemplate(const MLOperatorKernelCreationContext& kernelInfo) + : DmlOperatorQLinearAveragePooling(kernelInfo, UseGlobalPooling) + { + } +}; + +DML_OP_DEFINE_CREATION_FUNCTION(QLinearAveragePool, DmlOperatorQuantizedPoolingTemplate); +DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQuantizedPoolingTemplate); +//DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQLinearAveragePooling, true); //useGobalPool + +} // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 07ff4f3145459..c22254fee76a7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -257,6 +257,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(GlobalMaxPool); DML_OP_EXTERN_CREATION_FUNCTION(LpPool); DML_OP_EXTERN_CREATION_FUNCTION(GlobalLpPool); DML_OP_EXTERN_CREATION_FUNCTION(MaxRoiPool); +DML_OP_EXTERN_CREATION_FUNCTION(QLinearAveragePool); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign10); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign16); DML_OP_EXTERN_CREATION_FUNCTION(InstanceNormalization); @@ -587,6 +588,10 @@ constexpr static std::array supportedTypeListQLinea SupportedTensorDataTypes::Ints8Bit|SupportedTensorDataTypes::Float32, }; +constexpr static 
std::array supportedTypeListQLinearAveragePool = { + SupportedTensorDataTypes::Ints8Bit +}; + template constexpr auto requiredConstantCpuInputs(Args... args) { @@ -992,6 +997,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 9, MaxUnpool, typeNameListTwo, supportedTypeListMaxUnpool, DmlGraphSupport::Supported, requiredConstantCpuInputs(2))}, {REG_INFO( 11, MaxUnpool, typeNameListTwo, supportedTypeListMaxUnpool, DmlGraphSupport::Supported, requiredConstantCpuInputs(2))}, // 11 is identical to 9. + {REG_INFO_MS( 1, QLinearAveragePool, typeNameListDefault, supportedTypeListQLinearAveragePool, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, QLinearAdd, typeNameListDefault, supportedTypeListInteger8, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearConv, typeNameListFour, supportedTypeListQLinearConv, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearMatMul, typeNameListThree, supportedTypeListQLinearMatMul, DmlGraphSupport::Supported)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h index 5be84a931f4f1..543e30fcd9722 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/Attributes.h @@ -24,6 +24,7 @@ namespace AttrName static constexpr const char* Border = "border"; static constexpr const char* Broadcast = "broadcast"; static constexpr const char* CeilMode = "ceil_mode"; + static constexpr const char* ChannelsLast = "channels_last"; static constexpr const char* Clip = "clip"; static constexpr const char* CoordinateTransformationMode = "coordinate_transformation_mode"; static constexpr const char* CountIncludePad = "count_include_pad"; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index 4d59964dcc664..aa6b5baa1aa07 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -495,6 +495,7 @@ namespace OperatorHelper } args.useCeilingOutputShape = kernelInfo.GetOptionalAttribute(AttrName::CeilMode, 0); + args.channelsLast = kernelInfo.GetOptionalAttribute(AttrName::ChannelsLast, 0); return args; } @@ -2012,7 +2013,24 @@ namespace OperatorHelper } return outputShapes; } + + std::vector QLinearAveragePoolingHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const + { + auto inputShape = shapeInfo.GetInputTensorShape(0); + std::vector outputDimensions = InitializeKernelOutputDimensions(inputShape, m_kernel, m_kernel.channelsLast); + // MaxPool may have both an output and an indices tensor (both the same size). 
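For reference, the per-dimension output size that InitializeKernelOutputDimensions is expected to produce here follows the standard ONNX pooling arithmetic (dilations default to 1 for this contrib op). A minimal sketch with the rounding selected by ceil_mode; the function name is illustrative, not the actual helper:

    #include <cmath>
    #include <cstdint>

    inline uint32_t PooledDimension(
        uint32_t inputSize, uint32_t windowSize, uint32_t stride,
        uint32_t startPadding, uint32_t endPadding, bool useCeiling)
    {
        // (in + padBegin + padEnd - window) / stride, floored or ceiled, plus one.
        const double scaled =
            (static_cast<double>(inputSize) + startPadding + endPadding - windowSize) / stride;
        return static_cast<uint32_t>(useCeiling ? std::ceil(scaled) : std::floor(scaled)) + 1;
    }

    // e.g. PooledDimension(32, 32, 1, 0, 0, false) == 1, the global-pooling case.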
+ const uint32_t outputCount = shapeInfo.GetOutputCount(); + assert(outputCount == 1 || outputCount == 2); + + std::vector outputShapes; + for (uint32_t i = 0; i < outputCount; ++i) + { + outputShapes.push_back(outputDimensions); + } + return outputShapes; + } + std::vector RoiPoolingHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const { auto roiShape = shapeInfo.GetInputTensorShape(InputTensors::ROIS); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 5add951dccb78..47e7573951803 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -160,6 +160,7 @@ struct KernelArgs bool autoPad = false; bool autoPadSameUpper = false; bool useCeilingOutputShape = false; + bool channelsLast = false; uint32_t spatialDimensionCount = 0; KernelArgs(uint32_t spatialDimensionCount) : spatialDimensionCount(spatialDimensionCount) @@ -188,6 +189,7 @@ struct KernelArgs KernelArgs(KernelArgs const& kernelArgs, uint32_t minimumDimensionCount) : autoPad(kernelArgs.autoPad), autoPadSameUpper(kernelArgs.autoPadSameUpper), + channelsLast(kernelArgs.channelsLast), spatialDimensionCount(std::max(kernelArgs.spatialDimensionCount, minimumDimensionCount)) { ML_CHECK_VALID_ARGUMENT(spatialDimensionCount <= NcdhwSpatialDimensionCount); @@ -1168,6 +1170,15 @@ class RoiAlignHelper : public RoiPoolingHelperBase std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; }; +class QLinearAveragePoolingHelper : public PoolingHelperBase +{ +public: + template + QLinearAveragePoolingHelper(const Info_t& info, const Shape_t& shape/*, bool useGlobalPooling */) : PoolingHelperBase(info, shape, false) {} + std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; + +}; + class SqueezeHelper { public: @@ -1497,6 +1508,7 @@ using ShapeInferenceHelper_MaxUnpool = UnpoolingHelper; using ShapeInferenceHelper_LpPool = PoolingHelper; using ShapeInferenceHelper_GlobalLpPool = GlobalPoolingHelper; using ShapeInferenceHelper_MaxRoiPool = RoiPoolingHelper; +using ShapeInferenceHelper_QLinearAveragePool = QLinearAveragePoolingHelper; using ShapeInferenceHelper_RoiAlign10 = VersionedOpsetHelper; using ShapeInferenceHelper_RoiAlign16 = VersionedOpsetHelper; using ShapeInferenceHelper_InstanceNormalization = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index d785f77e24344..5293d630f40f2 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -441,6 +441,7 @@ namespace OperatorHelper static const int sc_sinceVer_GroupNorm = 1; static const int sc_sinceVer_DynamicQuantizeMatMul = 1; static const int sc_sinceVer_QLinearConcat = 1; + static const int sc_sinceVer_QLinearAveragePool = 1; } // namespace MsftOperatorSet1 } // namespace OperatorHelper From bf31e674c99ce7c544c9dd0fa561e894bb6f7157 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Mon, 18 Sep 2023 13:20:42 -0700 Subject: [PATCH 2/9] Implementation for QLinearGlobalAveragePool and addressing few review comments --- .../src/Operators/DmlOperator.cpp | 12 ++++---- .../src/Operators/DmlOperator.h | 4 +-- .../DmlOperatorQLinearAveragePooling.cpp | 6 +--- 
.../src/Operators/OperatorRegistration.cpp | 2 ++ .../OperatorAuthorHelper/OperatorHelper.cpp | 28 ++++++++++++++++--- .../dml/OperatorAuthorHelper/OperatorHelper.h | 18 ++++++++++-- .../OperatorAuthorHelper/OperatorVersions.h | 1 + .../qlinear_global_average_pool_test.cc | 3 ++ .../test/contrib_ops/qlinear_pool_test.cc | 10 +++++++ 9 files changed, 64 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 78f7d1aef33bd..5c2b75d197352 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -636,17 +636,17 @@ namespace Dml } void DmlOperator::ConvertNHWCToNCHW( - const uint32_t& dimCount, + const uint32_t dimCount, const gsl::span nhwcSizes, std::vector& nchwSizes, std::vector& nchwInputStrides) { int i = 0; - const uint32_t inputBatch = nhwcSizes[ i++ ]; - const uint32_t inputDepth = dimCount == 5 ? nhwcSizes[ i++ ] : 0; - const uint32_t inputHeight = nhwcSizes[ i++ ]; - const uint32_t inputWidth = nhwcSizes[ i++ ]; - const uint32_t inputChannels = nhwcSizes[ i++ ]; + const uint32_t inputBatch = nhwcSizes[i++]; + const uint32_t inputDepth = dimCount == 5 ? nhwcSizes[i++] : 0; + const uint32_t inputHeight = nhwcSizes[i++]; + const uint32_t inputWidth = nhwcSizes[i++]; + const uint32_t inputChannels = nhwcSizes[i++]; if (dimCount == 4) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index 394195ce14a6d..cecb943c382cf 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -119,8 +119,8 @@ namespace Dml ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); - void DmlOperator::ConvertNHWCToNCHW( - const uint32_t& dimCount, + void ConvertNHWCToNCHW( + const uint32_t dimCount, const gsl::span nhwcSizes, std::vector& nchwSizes, std::vector& nchwInputStrides diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp index 4303149eae347..2d4b28b69126e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -35,11 +35,8 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe std::vector inputShape = kernelInfo.GetTensorShapeDescription().GetInputTensorShape(OrtInputTensors::ortInput); std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); - // Initialize the input descriptions with broadcasting - m_inputTensorDescs[OrtInputTensors::ortInput] = CreateTensorDescFromInput(kernelInfo, OrtInputTensors::ortInput, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, inputShape); - uint32_t dmlDimSize = m_inputTensorDescs[OrtInputTensors::ortInput].GetDimensionCount(); - // Resize the Input Scale to be the same dimension as the input tensor. 
+ // Reshape the Input Scale to be the same dimension as the input tensor. // The 1D tensor needs to be moved to the H channel. m_inputTensorDescs[OrtInputTensors::ortInputScale] = CreateTensorDescFromInput( kernelInfo, @@ -203,6 +200,5 @@ class DmlOperatorQuantizedPoolingTemplate : public DmlOperatorQLinearAveragePool DML_OP_DEFINE_CREATION_FUNCTION(QLinearAveragePool, DmlOperatorQuantizedPoolingTemplate); DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQuantizedPoolingTemplate); -//DML_OP_DEFINE_CREATION_FUNCTION(QLinearGlobalAveragePool, DmlOperatorQLinearAveragePooling, true); //useGobalPool } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index c22254fee76a7..daa8d70b6dac2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -258,6 +258,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(LpPool); DML_OP_EXTERN_CREATION_FUNCTION(GlobalLpPool); DML_OP_EXTERN_CREATION_FUNCTION(MaxRoiPool); DML_OP_EXTERN_CREATION_FUNCTION(QLinearAveragePool); +DML_OP_EXTERN_CREATION_FUNCTION(QLinearGlobalAveragePool); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign10); DML_OP_EXTERN_CREATION_FUNCTION(RoiAlign16); DML_OP_EXTERN_CREATION_FUNCTION(InstanceNormalization); @@ -998,6 +999,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 11, MaxUnpool, typeNameListTwo, supportedTypeListMaxUnpool, DmlGraphSupport::Supported, requiredConstantCpuInputs(2))}, // 11 is identical to 9. {REG_INFO_MS( 1, QLinearAveragePool, typeNameListDefault, supportedTypeListQLinearAveragePool, DmlGraphSupport::Supported)}, + {REG_INFO_MS( 1, QLinearGlobalAveragePool, typeNameListDefault, supportedTypeListQLinearAveragePool, DmlGraphSupport::Supported)}, {REG_INFO_MS( 1, QLinearAdd, typeNameListDefault, supportedTypeListInteger8, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearConv, typeNameListFour, supportedTypeListQLinearConv, DmlGraphSupport::Supported)}, {REG_INFO( 10, QLinearMatMul, typeNameListThree, supportedTypeListQLinearMatMul, DmlGraphSupport::Supported)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index aa6b5baa1aa07..1fcd3b04300f4 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -365,13 +365,20 @@ namespace OperatorHelper } // Creates a kernel that spans the entire spatial dimensions of the input. - KernelArgs InitializeGlobalKernel(gsl::span inputDimensions) + KernelArgs InitializeGlobalKernel( + const MLOperatorAttributes& kernelInfo, + gsl::span inputDimensions) { ML_CHECK_VALID_ARGUMENT(inputDimensions.size() > NonspatialDimensionCount); // Must be at least 1D convolution (in 3D tensor) uint32_t spatialDimensionCount = gsl::narrow_cast(inputDimensions.size()) - NonspatialDimensionCount; ML_CHECK_VALID_ARGUMENT(spatialDimensionCount <= NcdhwSpatialDimensionCount); // Support up to 3D convolution (in 5D tensor). 
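A worked example of the window sizes the loop below derives for global pooling (shapes are illustrative, and NonspatialDimensionCount is assumed to count the batch and channel dimensions): a 4-D input has two spatial dimensions, and channels_last only changes which positions they are read from:

    NCHW input {1, 3, 32, 32}: dimOffset = 0  ->  windowSize = {32, 32}
    NHWC input {1, 32, 32, 3}: dimOffset = 1  ->  windowSize = {32, 32}  (skips the trailing C)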
KernelArgs args(spatialDimensionCount); + args.useCeilingOutputShape = kernelInfo.GetOptionalAttribute(AttrName::CeilMode, 0); + args.channelsLast = kernelInfo.GetOptionalAttribute(AttrName::ChannelsLast, 0); + // For Global Pooling, kernel size equal to the spatial dimension of input tensor + // NHWC layout need to offset by one dim to acount for channel placed at the end + int dimOffset = args.channelsLast ? 1 : 0; for (size_t dim = 0; dim < spatialDimensionCount; ++dim) { @@ -379,7 +386,7 @@ namespace OperatorHelper args.dilations[dim] = 1; args.startPadding[dim] = 0; args.endPadding[dim] = 0; - args.windowSize[dim] = gsl::narrow_cast(inputDimensions[inputDimensions.size() - spatialDimensionCount + dim]); + args.windowSize[dim] = gsl::narrow_cast(inputDimensions[inputDimensions.size() - spatialDimensionCount + dim - dimOffset]); } return args; @@ -2019,9 +2026,22 @@ namespace OperatorHelper auto inputShape = shapeInfo.GetInputTensorShape(0); std::vector outputDimensions = InitializeKernelOutputDimensions(inputShape, m_kernel, m_kernel.channelsLast); - // MaxPool may have both an output and an indices tensor (both the same size). const uint32_t outputCount = shapeInfo.GetOutputCount(); - assert(outputCount == 1 || outputCount == 2); + + std::vector outputShapes; + for (uint32_t i = 0; i < outputCount; ++i) + { + outputShapes.push_back(outputDimensions); + } + return outputShapes; + } + + std::vector QLinearGlobalAveragePoolingHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const + { + auto inputShape = shapeInfo.GetInputTensorShape(0); + std::vector outputDimensions = InitializeKernelOutputDimensions(inputShape, m_kernel, m_kernel.channelsLast); + + const uint32_t outputCount = shapeInfo.GetOutputCount(); std::vector outputShapes; for (uint32_t i = 0; i < outputCount; ++i) diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index 47e7573951803..8d7f0b5b043d0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -213,7 +213,9 @@ std::vector InitializeKernelOutputDimsTranspose( gsl::span inputDimensions, const KernelArgs& args); -KernelArgs InitializeGlobalKernel(gsl::span inputDimensions); +KernelArgs InitializeGlobalKernel( + const MLOperatorAttributes& kernelInfo, + gsl::span inputDimensions); KernelArgs InitializeKernel( const MLOperatorAttributes& kernelInfo, @@ -1068,7 +1070,7 @@ class PoolingHelperBase bool useGlobalPooling ) : m_kernel(useGlobalPooling - ? InitializeGlobalKernel(shape.GetInputTensorShape(0)) + ? 
InitializeGlobalKernel(info, shape.GetInputTensorShape(0)) : InitializeKernel(info, static_cast(shape.GetInputTensorShape(0).size()), gsl::span())) { if (!useGlobalPooling) @@ -1174,7 +1176,16 @@ class QLinearAveragePoolingHelper : public PoolingHelperBase { public: template - QLinearAveragePoolingHelper(const Info_t& info, const Shape_t& shape/*, bool useGlobalPooling */) : PoolingHelperBase(info, shape, false) {} + QLinearAveragePoolingHelper(const Info_t& info, const Shape_t& shape) : PoolingHelperBase(info, shape, false) {} + std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; + +}; + +class QLinearGlobalAveragePoolingHelper : public PoolingHelperBase +{ +public: + template + QLinearGlobalAveragePoolingHelper(const Info_t& info, const Shape_t& shape) : PoolingHelperBase(info, shape, true) {} std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; }; @@ -1509,6 +1520,7 @@ using ShapeInferenceHelper_LpPool = PoolingHelper; using ShapeInferenceHelper_GlobalLpPool = GlobalPoolingHelper; using ShapeInferenceHelper_MaxRoiPool = RoiPoolingHelper; using ShapeInferenceHelper_QLinearAveragePool = QLinearAveragePoolingHelper; +using ShapeInferenceHelper_QLinearGlobalAveragePool = QLinearGlobalAveragePoolingHelper; using ShapeInferenceHelper_RoiAlign10 = VersionedOpsetHelper; using ShapeInferenceHelper_RoiAlign16 = VersionedOpsetHelper; using ShapeInferenceHelper_InstanceNormalization = GetOutputShapeAsInputShapeHelper; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index 5293d630f40f2..078f4a7aef6b0 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -442,6 +442,7 @@ namespace OperatorHelper static const int sc_sinceVer_DynamicQuantizeMatMul = 1; static const int sc_sinceVer_QLinearConcat = 1; static const int sc_sinceVer_QLinearAveragePool = 1; + static const int sc_sinceVer_QLinearGlobalAveragePool = 1; } // namespace MsftOperatorSet1 } // namespace OperatorHelper diff --git a/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc index 8fb245819fd26..71b6f27b5391f 100644 --- a/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc @@ -66,6 +66,9 @@ void RunQLinearGlobalAveragePool( test.AddInput("y_scale", {}, {y_scale}); test.AddInput("y_zero_point", {}, {y_zero_point}); test.AddOutput("Y", y_dims, y_data); + if (channels_last) { + test.AddAttribute("channels_last", (int64_t)1LL); + } auto q8checker = [&](const std::vector& fetches, const std::string& provider_type) { const OrtValue& ort_value = fetches[0]; diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc index 78f7f431aa66e..dfe50b8486857 100644 --- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc @@ -646,5 +646,15 @@ TEST(QLinearPoolTest, AveragePool2D_Global_nhwc_S8) { 1); // count_include_pad } +TEST(QLinearPoolTest, AveragePool2D_Global_mock) { + RunQLinearAveragePoolNhwc( + {1, 1, 32, 32}, // x shape + {1, 1, 1, 1}, // expected y shape + {32, 32}, // kernel shape + {1, 1}, // strides + {0, 0, 0, 0}, // pads + 1); // count_include_pad +} + } // namespace test } // namespace onnxruntime From 
38ed2212b4133c344b28fd26491c193ac7214055 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Wed, 20 Sep 2023 02:44:20 -0700 Subject: [PATCH 3/9] Update Layout Conversion logic --- .../DmlOperatorQLinearAveragePooling.cpp | 143 ++++++------------ .../DmlExecutionProvider/src/TensorDesc.cpp | 49 ++++++ .../dml/DmlExecutionProvider/src/TensorDesc.h | 4 + 3 files changed, 98 insertions(+), 98 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp index 2d4b28b69126e..8433e90bb9b24 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -36,68 +36,9 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe std::vector outputShape = kernelInfo.GetTensorShapeDescription().GetOutputTensorShape(0); uint32_t dmlDimSize = m_inputTensorDescs[OrtInputTensors::ortInput].GetDimensionCount(); - // Reshape the Input Scale to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. - m_inputTensorDescs[OrtInputTensors::ortInputScale] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortInputScale, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - - // Resize the Input ZeroPoint to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. - if (kernelInfo.IsInputValid(OrtInputTensors::ortInputZeroPoint)) - { - - m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortInputZeroPoint, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - } - - // Resize the Output Scale to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. - m_inputTensorDescs[OrtInputTensors::ortOutputScale] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortInputScale, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - - // Resize the Input ZeroPoint to be the same dimension as the input tensor. - // The 1D tensor needs to be moved to the H channel. 
- if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) - { - - m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = CreateTensorDescFromInput( - kernelInfo, - OrtInputTensors::ortOutputZeroPoint, - TensorAxis::DoNotCoerce, - TensorAxis::H, - TensorAxis::LeftAligned, - std::nullopt, - dmlDimSize - ); - } - - // Initialize the output description while overriding the shape - m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::W, TensorAxis::RightAligned, outputShape); - - assert(m_kernel.spatialDimensionCount <= ARRAYSIZE(m_kernel.windowSize)); - - // DML requires that DimensionCount be equal to Input.DimCount - 2 for Pooling + ML_CHECK_VALID_ARGUMENT(dmlDimSize >= 2); + + // DML requires that DimensionCount be equal to Input.dmlDimSize - 2 for Pooling uint32_t expectedSpatialDimCount = m_inputTensorDescs[0].GetDimensionCount() - 2; if (m_kernel.spatialDimensionCount < expectedSpatialDimCount) { @@ -124,45 +65,51 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe m_kernel.spatialDimensionCount = expectedSpatialDimCount; } + // Initialize dimensionMapping for NCHW or NHWC layout + std::vector dimensionMapping = {0u, dmlDimSize - 1u}; + dimensionMapping.resize(dmlDimSize); if (isNhwc) { - uint32_t dimCount = m_inputTensorDescs[0].GetDimensionCount(); - const auto inputSizes = m_inputTensorDescs[OrtInputTensors::ortInput].GetSizes(); - std::vector nchwInputSizes; - std::vector nchwInputStrides; - ConvertNHWCToNCHW(dimCount, inputSizes, nchwInputSizes, nchwInputStrides); - m_inputTensorDescs[OrtInputTensors::ortInput] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInput].GetDmlDataType(), nchwInputSizes, nchwInputStrides); - - gsl::span inputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortInputScale].GetSizes(); - std::vector nchwInputScaleSizes; - std::vector nchwInputScaleStrides; - ConvertNHWCToNCHW(dimCount, inputScaleSizes, nchwInputScaleSizes, nchwInputScaleStrides); - m_inputTensorDescs[OrtInputTensors::ortInputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputScale].GetDmlDataType(), nchwInputScaleSizes, nchwInputScaleStrides); - - gsl::span inputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetSizes(); - std::vector nchwInputZeroPointSizes; - std::vector nchwInputZeroPointStrides; - ConvertNHWCToNCHW(dimCount, inputZeroPointSizes, nchwInputZeroPointSizes, nchwInputZeroPointStrides); - m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].GetDmlDataType(), nchwInputZeroPointSizes, nchwInputZeroPointStrides); - - gsl::span outputScaleSizes = m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetSizes(); - std::vector nchwOutputScaleSizes; - std::vector nchwOutputScaleStrides; - ConvertNHWCToNCHW(dimCount, outputScaleSizes, nchwOutputScaleSizes, nchwOutputScaleStrides); - m_inputTensorDescs[OrtInputTensors::ortOutputScale] = TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputScale].GetDmlDataType(), nchwOutputScaleSizes, nchwOutputScaleStrides); - - gsl::span outputZeroPointSizes = m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetSizes(); - std::vector nchwOutputZeroPointSizes; - std::vector nchwOutputZeroPointStrides; - ConvertNHWCToNCHW(dimCount, outputZeroPointSizes, nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); - m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint] = 
TensorDesc(m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].GetDmlDataType(), nchwOutputZeroPointSizes, nchwOutputZeroPointStrides); - - gsl::span outputSizes = m_outputTensorDescs[0].GetSizes(); - std::vector nchwOutputSizes; - std::vector nchwOutputStrides; - ConvertNHWCToNCHW(dimCount, outputSizes, nchwOutputSizes, nchwOutputStrides); - m_outputTensorDescs[0] = TensorDesc(m_outputTensorDescs[0].GetDmlDataType(), nchwOutputSizes, nchwOutputStrides); + // Form a remapping for dimensions so C is moved before the spatial dimensions. + // e.g. NWC -> {0,2,1} -> NCW + // NHWC -> {0,3,1,2} -> NCHW + // NDHWC -> {0,4,1,2,3} -> NCDHW + std::iota(dimensionMapping.begin() + 2, dimensionMapping.end(), 1u); + } + else + { + // Use NCHW {0,1,2,3} format with increasing order of indexs + std::iota(dimensionMapping.begin() + 1, dimensionMapping.end(), 1u); } + m_inputTensorDescs[OrtInputTensors::ortInput].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + // Reshape the Input Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortInputScale].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + // Reshape the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + if (kernelInfo.IsInputValid(OrtInputTensors::ortInputZeroPoint)) + { + m_inputTensorDescs[OrtInputTensors::ortInputZeroPoint].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + } + + // Reshape the Output Scale to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. + m_inputTensorDescs[OrtInputTensors::ortOutputScale].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + + // Reshape the Input ZeroPoint to be the same dimension as the input tensor. + // The 1D tensor needs to be moved to the H channel. 
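The PermuteDimensions calls above rely on an NHWC buffer being describable as an NCHW tensor purely by permuting sizes and strides, with no data movement; folding that into TensorDesc lets every input, scale, zero-point, and output tensor reuse the same mapping instead of the earlier per-tensor ConvertNHWCToNCHW conversions. A minimal sketch of that permutation for a left-aligned 4-D tensor, with illustrative sizes (the arrays and main function below are not part of the patch):

    #include <array>
    #include <cstdint>

    int main()
    {
        std::array<uint32_t, 4> sizes   = {1, 4, 4, 3};    // NHWC sizes
        std::array<uint32_t, 4> strides = {48, 12, 3, 1};  // packed strides, computed right-to-left
        std::array<uint32_t, 4> mapping = {0, 3, 1, 2};    // NHWC -> NCHW, as in the comment above

        std::array<uint32_t, 4> newSizes{};
        std::array<uint32_t, 4> newStrides{};
        for (size_t i = 0; i < mapping.size(); ++i)
        {
            newSizes[i]   = sizes[mapping[i]];    // reorder sizes into NCHW
            newStrides[i] = strides[mapping[i]];  // carry the matching strides along
        }
        // newSizes   == {1, 3, 4, 4}   -- NCHW view of the same buffer
        // newStrides == {48, 1, 12, 3} -- channel stride 1, so no copy is required
    }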
+ if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) + { + m_inputTensorDescs[OrtInputTensors::ortOutputZeroPoint].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + } + + // Initialize the output description while overriding the shape + m_outputTensorDescs[0].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); + + assert(m_kernel.spatialDimensionCount <= ARRAYSIZE(m_kernel.windowSize)); std::vector inputDescs = GetDmlInputDescs(); std::vector outputDescs = GetDmlOutputDescs(); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 067a320dd8000..36156e98dd311 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -315,3 +315,52 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm } m_bufferTensorDesc.DimensionCount = newDimensionCount; } + +// Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout +void TensorDesc::PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment) +{ + EnsureMinimumDimensionCount(dimensionMapping.size(), alignment); + InitializeStrides(static_cast(dimensionMapping.size()), alignment); + PermuteArray(dimensionMapping, alignment); +} + +// Shuffle m_sizes and m_strides acording to the indexes pointed by dimensionMapping +void TensorDesc::PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment) +{ + std::vector temp_sizes(dimensionMapping.size()); + std::vector temp_strides(dimensionMapping.size()); + // Right alignment values are shifted to the end + int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - static_cast(dimensionMapping.size()) : 0; + + for (size_t i = 0; i < dimensionMapping.size(); i++) { + temp_sizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; + temp_strides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; + } + + std::copy(temp_sizes.begin(), temp_sizes.end(), m_sizes + alignmentOffset); + std::copy(temp_strides.begin(), temp_strides.end(), m_strides + alignmentOffset); + m_bufferTensorDesc.Sizes = m_sizes; + m_bufferTensorDesc.Strides = m_strides; +} + +void TensorDesc::InitializeStrides(int dimensionCount, const TensorAxis alignment) +{ + ML_CHECK_VALID_ARGUMENT(alignment == TensorAxis::RightAligned || alignment == TensorAxis::LeftAligned); + // Right alignment values are shifted to the end + int alignmentOffset = (alignment == TensorAxis::RightAligned) ? 
MaximumDimensionCount - dimensionCount : 0; + int index = dimensionCount + alignmentOffset; + uint32_t stride = 1; + m_strides[index - 1] = 1; + for (int i = index - 2; i >= 0; i--) + { + stride *= m_sizes[i + 1]; + m_strides[i] = stride; + } +} + +void TensorDesc::EnsureMinimumDimensionCount(const size_t dimensionCount, const TensorAxis alignment) +{ + // m_sizes and m_strides are arrays of size MaximumDimensionCount + assert(MaximumDimensionCount >= dimensionCount); + SetDimensionCount(static_cast(dimensionCount), alignment); +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index ff70dec5b8871..041fee90284e5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -44,6 +44,10 @@ namespace Dml gsl::span GetSizes() const { return { m_sizes, m_sizes + m_bufferTensorDesc.DimensionCount }; } gsl::span GetStrides() const; void SetStrides(gsl::span strides); + void PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment); + void PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment); + void InitializeStrides(int count, const TensorAxis alignment); + void EnsureMinimumDimensionCount(const size_t count, const TensorAxis alignment); inline uint64_t GetBufferSizeInBytes() const { From f1905765ec8d072359a685572fe5a16294640788 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 00:08:36 -0700 Subject: [PATCH 4/9] Address Reviews --- .../src/Operators/DmlOperator.cpp | 34 -------------- .../src/Operators/DmlOperator.h | 7 --- .../DmlOperatorQLinearAveragePooling.cpp | 1 - .../DmlExecutionProvider/src/TensorDesc.cpp | 46 +++++++++++-------- .../dml/DmlExecutionProvider/src/TensorDesc.h | 8 ++-- .../test/contrib_ops/qlinear_pool_test.cc | 10 ---- 6 files changed, 31 insertions(+), 75 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index 5c2b75d197352..25c7be42d6425 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -635,40 +635,6 @@ namespace Dml )); } - void DmlOperator::ConvertNHWCToNCHW( - const uint32_t dimCount, - const gsl::span nhwcSizes, - std::vector& nchwSizes, - std::vector& nchwInputStrides) - { - int i = 0; - const uint32_t inputBatch = nhwcSizes[i++]; - const uint32_t inputDepth = dimCount == 5 ? 
nhwcSizes[i++] : 0; - const uint32_t inputHeight = nhwcSizes[i++]; - const uint32_t inputWidth = nhwcSizes[i++]; - const uint32_t inputChannels = nhwcSizes[i++]; - - if (dimCount == 4) - { - nchwSizes = { inputBatch, inputChannels, inputHeight, inputWidth }; - nchwInputStrides = { inputHeight * inputWidth * inputChannels, - 1, - inputWidth * inputChannels, - inputChannels - }; - } - else - { - nchwSizes = { inputBatch, inputChannels, inputDepth, inputHeight, inputWidth }; - nchwInputStrides = { inputDepth * inputChannels * inputWidth * inputHeight, - 1, - inputChannels * inputWidth * inputHeight, - inputChannels * inputWidth, - inputChannels - }; - } - } - TensorDesc DmlOperator::CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index cecb943c382cf..df123f8db4658 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -118,13 +118,6 @@ namespace Dml // ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); - - void ConvertNHWCToNCHW( - const uint32_t dimCount, - const gsl::span nhwcSizes, - std::vector& nchwSizes, - std::vector& nchwInputStrides - ); TensorDesc CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp index 8433e90bb9b24..0fccedfe311c1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearAveragePooling.cpp @@ -98,7 +98,6 @@ class DmlOperatorQLinearAveragePooling : public DmlOperator, public PoolingHelpe // The 1D tensor needs to be moved to the H channel. m_inputTensorDescs[OrtInputTensors::ortOutputScale].PermuteDimensions(dimensionMapping, TensorAxis::LeftAligned); - // Reshape the Input ZeroPoint to be the same dimension as the input tensor. // The 1D tensor needs to be moved to the H channel. 
if (kernelInfo.IsInputValid(OrtInputTensors::ortOutputZeroPoint)) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 36156e98dd311..08ff04e0d5b57 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -317,38 +317,45 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm } // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout -void TensorDesc::PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment) +void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureMinimumDimensionCount(dimensionMapping.size(), alignment); - InitializeStrides(static_cast(dimensionMapping.size()), alignment); + EnsureMinimumDimensionCount(static_cast(dimensionMapping.size()), alignment); + EnsureStridesExist(static_cast(dimensionMapping.size()), alignment); PermuteArray(dimensionMapping, alignment); } -// Shuffle m_sizes and m_strides acording to the indexes pointed by dimensionMapping -void TensorDesc::PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment) +// Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping +void TensorDesc::PermuteArray(gsl::span dimensionMapping, TensorAxis alignment) { - std::vector temp_sizes(dimensionMapping.size()); - std::vector temp_strides(dimensionMapping.size()); + std::vector tempSizes(dimensionMapping.size()); + std::vector tempStrides(dimensionMapping.size()); // Right alignment values are shifted to the end int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - static_cast(dimensionMapping.size()) : 0; - for (size_t i = 0; i < dimensionMapping.size(); i++) { - temp_sizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; - temp_strides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; + for (size_t i = 0; i < dimensionMapping.size(); i++) + { + tempSizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; + tempStrides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; } - std::copy(temp_sizes.begin(), temp_sizes.end(), m_sizes + alignmentOffset); - std::copy(temp_strides.begin(), temp_strides.end(), m_strides + alignmentOffset); + std::copy(tempSizes.begin(), tempSizes.end(), m_sizes + alignmentOffset); + std::copy(tempStrides.begin(), tempStrides.end(), m_strides + alignmentOffset); m_bufferTensorDesc.Sizes = m_sizes; m_bufferTensorDesc.Strides = m_strides; } -void TensorDesc::InitializeStrides(int dimensionCount, const TensorAxis alignment) +void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignment) { + if (m_bufferTensorDesc.Strides != nullptr) + { + // Strides are populated + return; + } + ML_CHECK_VALID_ARGUMENT(alignment == TensorAxis::RightAligned || alignment == TensorAxis::LeftAligned); // Right alignment values are shifted to the end - int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - dimensionCount : 0; - int index = dimensionCount + alignmentOffset; + int alignmentOffset = (alignment == TensorAxis::RightAligned) ? 
MaximumDimensionCount - static_cast(dimensionCount) : 0; + int index = static_cast(dimensionCount) + alignmentOffset; uint32_t stride = 1; m_strides[index - 1] = 1; for (int i = index - 2; i >= 0; i--) @@ -358,9 +365,10 @@ void TensorDesc::InitializeStrides(int dimensionCount, const TensorAxis alignmen } } -void TensorDesc::EnsureMinimumDimensionCount(const size_t dimensionCount, const TensorAxis alignment) +void TensorDesc::EnsureMinimumDimensionCount(uint32_t dimensionCount, TensorAxis alignment) { - // m_sizes and m_strides are arrays of size MaximumDimensionCount - assert(MaximumDimensionCount >= dimensionCount); - SetDimensionCount(static_cast(dimensionCount), alignment); + if(dimensionCount != m_bufferTensorDesc.DimensionCount) + { + SetDimensionCount(dimensionCount, alignment); + } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 041fee90284e5..fbf28f2b425c4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -44,10 +44,8 @@ namespace Dml gsl::span GetSizes() const { return { m_sizes, m_sizes + m_bufferTensorDesc.DimensionCount }; } gsl::span GetStrides() const; void SetStrides(gsl::span strides); - void PermuteDimensions(const std::vector dimensionMapping, const TensorAxis alignment); - void PermuteArray(const std::vector dimensionMapping, const TensorAxis alignment); - void InitializeStrides(int count, const TensorAxis alignment); - void EnsureMinimumDimensionCount(const size_t count, const TensorAxis alignment); + void PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment); + void EnsureMinimumDimensionCount(uint32_t count, TensorAxis alignment); inline uint64_t GetBufferSizeInBytes() const { @@ -94,6 +92,8 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; + void PermuteArray(gsl::span dimensionMapping, TensorAxis alignment); + void EnsureStridesExist(uint32_t count, TensorAxis alignment); }; class TensorDescBuilder diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc index dfe50b8486857..78f7f431aa66e 100644 --- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc +++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc @@ -646,15 +646,5 @@ TEST(QLinearPoolTest, AveragePool2D_Global_nhwc_S8) { 1); // count_include_pad } -TEST(QLinearPoolTest, AveragePool2D_Global_mock) { - RunQLinearAveragePoolNhwc( - {1, 1, 32, 32}, // x shape - {1, 1, 1, 1}, // expected y shape - {32, 32}, // kernel shape - {1, 1}, // strides - {0, 0, 0, 0}, // pads - 1); // count_include_pad -} - } // namespace test } // namespace onnxruntime From cb20fa443592fd0344c88934572df23d4790524e Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 15:14:55 -0700 Subject: [PATCH 5/9] Address review 2 --- .../src/Operators/DmlOperator.h | 2 +- .../DmlExecutionProvider/src/TensorDesc.cpp | 33 ++++++------------- .../dml/DmlExecutionProvider/src/TensorDesc.h | 5 ++- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h index df123f8db4658..c1e8cf42a974c 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.h @@ -118,7 +118,7 @@ namespace Dml // ComPtr InitializeZeroInt64Tensor(uint64_t tensorSizeInBytes); void ExecuteZeroInt64Tensor(IDMLCompiledOperator* compiledOperator, IMLOperatorTensor* tensor); - + TensorDesc CreateTensorDescFromInput( const MLOperatorKernelCreationContext& kernelInfo, uint32_t index, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 08ff04e0d5b57..653836f72bcae 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -319,32 +319,30 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureMinimumDimensionCount(static_cast(dimensionMapping.size()), alignment); - EnsureStridesExist(static_cast(dimensionMapping.size()), alignment); - PermuteArray(dimensionMapping, alignment); + EnsureStridesExist(static_cast(dimensionMapping.size())); + SetDimensionCount(static_cast(dimensionMapping.size()), alignment); + PermuteArray(dimensionMapping); } // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping -void TensorDesc::PermuteArray(gsl::span dimensionMapping, TensorAxis alignment) +void TensorDesc::PermuteArray(gsl::span dimensionMapping) { std::vector tempSizes(dimensionMapping.size()); std::vector tempStrides(dimensionMapping.size()); - // Right alignment values are shifted to the end - int alignmentOffset = (alignment == TensorAxis::RightAligned) ? MaximumDimensionCount - static_cast(dimensionMapping.size()) : 0; for (size_t i = 0; i < dimensionMapping.size(); i++) { - tempSizes[i] = m_sizes[dimensionMapping[i] + alignmentOffset]; - tempStrides[i] = m_strides[dimensionMapping[i] + alignmentOffset]; + tempSizes[i] = m_sizes[dimensionMapping[i]]; + tempStrides[i] = m_strides[dimensionMapping[i]]; } - std::copy(tempSizes.begin(), tempSizes.end(), m_sizes + alignmentOffset); - std::copy(tempStrides.begin(), tempStrides.end(), m_strides + alignmentOffset); + std::copy(tempSizes.begin(), tempSizes.end(), m_sizes); + std::copy(tempStrides.begin(), tempStrides.end(), m_strides); m_bufferTensorDesc.Sizes = m_sizes; m_bufferTensorDesc.Strides = m_strides; } -void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignment) +void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) { if (m_bufferTensorDesc.Strides != nullptr) { @@ -352,10 +350,7 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignmen return; } - ML_CHECK_VALID_ARGUMENT(alignment == TensorAxis::RightAligned || alignment == TensorAxis::LeftAligned); - // Right alignment values are shifted to the end - int alignmentOffset = (alignment == TensorAxis::RightAligned) ? 
MaximumDimensionCount - static_cast(dimensionCount) : 0; - int index = static_cast(dimensionCount) + alignmentOffset; + int index = static_cast(dimensionCount); uint32_t stride = 1; m_strides[index - 1] = 1; for (int i = index - 2; i >= 0; i--) @@ -364,11 +359,3 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount, TensorAxis alignmen m_strides[i] = stride; } } - -void TensorDesc::EnsureMinimumDimensionCount(uint32_t dimensionCount, TensorAxis alignment) -{ - if(dimensionCount != m_bufferTensorDesc.DimensionCount) - { - SetDimensionCount(dimensionCount, alignment); - } -} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index fbf28f2b425c4..2a7b0f3714b5d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -45,7 +45,6 @@ namespace Dml gsl::span GetStrides() const; void SetStrides(gsl::span strides); void PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment); - void EnsureMinimumDimensionCount(uint32_t count, TensorAxis alignment); inline uint64_t GetBufferSizeInBytes() const { @@ -92,8 +91,8 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; - void PermuteArray(gsl::span dimensionMapping, TensorAxis alignment); - void EnsureStridesExist(uint32_t count, TensorAxis alignment); + void PermuteArray(gsl::span dimensionMapping); + void EnsureStridesExist(uint32_t count); }; class TensorDescBuilder From 24f1392091feab2f887fb61996c59e5e3043978e Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 16:04:49 -0700 Subject: [PATCH 6/9] Clean up --- .../dml/DmlExecutionProvider/src/TensorDesc.cpp | 10 +++------- .../dml/DmlExecutionProvider/src/TensorDesc.h | 1 - 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 653836f72bcae..c685ec460dfe0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -321,14 +321,10 @@ void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, c { EnsureStridesExist(static_cast(dimensionMapping.size())); SetDimensionCount(static_cast(dimensionMapping.size()), alignment); - PermuteArray(dimensionMapping); -} -// Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping -void TensorDesc::PermuteArray(gsl::span dimensionMapping) -{ - std::vector tempSizes(dimensionMapping.size()); - std::vector tempStrides(dimensionMapping.size()); + // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping + std::vector tempSizes{m_sizes, m_sizes + MaximumDimensionCount}; + std::vector tempStrides{m_strides, m_strides + MaximumDimensionCount}; for (size_t i = 0; i < dimensionMapping.size(); i++) { diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 2a7b0f3714b5d..5925805025cc4 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -91,7 +91,6 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; 
uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; - void PermuteArray(gsl::span dimensionMapping); void EnsureStridesExist(uint32_t count); }; From 2c4b8f91e8a0e53c98010c833631ea95995538ec Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 16:57:31 -0700 Subject: [PATCH 7/9] remove some redundancies --- .../providers/dml/DmlExecutionProvider/src/TensorDesc.cpp | 6 ++---- .../providers/dml/DmlExecutionProvider/src/TensorDesc.h | 1 + 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index c685ec460dfe0..92d7e500afe15 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -328,12 +328,10 @@ void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, c for (size_t i = 0; i < dimensionMapping.size(); i++) { - tempSizes[i] = m_sizes[dimensionMapping[i]]; - tempStrides[i] = m_strides[dimensionMapping[i]]; + m_sizes[i] = tempSizes[dimensionMapping[i]]; + m_strides[i] = tempStrides[dimensionMapping[i]]; } - std::copy(tempSizes.begin(), tempSizes.end(), m_sizes); - std::copy(tempStrides.begin(), tempStrides.end(), m_strides); m_bufferTensorDesc.Sizes = m_sizes; m_bufferTensorDesc.Strides = m_strides; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 5925805025cc4..57015e3fb58d1 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -91,6 +91,7 @@ namespace Dml uint32_t m_sizes[MaximumDimensionCount] = {}; uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; + void EnsureStridesExist(uint32_t count); }; From 78ebdeadd6c88d4158f31c126ea7c80bad877b49 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 18:25:11 -0700 Subject: [PATCH 8/9] Update Stride Calculation --- .../providers/dml/DmlExecutionProvider/src/TensorDesc.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 92d7e500afe15..85d54da120425 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -319,7 +319,7 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureStridesExist(static_cast(dimensionMapping.size())); + EnsureStridesExist(m_bufferTensorDesc.DimensionCount); SetDimensionCount(static_cast(dimensionMapping.size()), alignment); // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping @@ -346,10 +346,9 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) int index = static_cast(dimensionCount); uint32_t stride = 1; - m_strides[index - 1] = 1; - for (int i = index - 2; i >= 0; i--) + for (int i = index; i-- > 0;) { - stride *= m_sizes[i + 1]; m_strides[i] = stride; + stride *= m_sizes[i]; } } From 
534985647b95d669550f684ab4fc0f2a923893f9 Mon Sep 17 00:00:00 2001 From: Anagha Rao Date: Thu, 21 Sep 2023 19:03:53 -0700 Subject: [PATCH 9/9] clean up 2 --- .../providers/dml/DmlExecutionProvider/src/TensorDesc.cpp | 7 +++---- .../providers/dml/DmlExecutionProvider/src/TensorDesc.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp index 85d54da120425..a2183aab52eed 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.cpp @@ -319,7 +319,7 @@ void TensorDesc::SetDimensionCount(uint32_t newDimensionCount, TensorAxis alignm // Uses dimensionMapping to reorder m_sizes and m_strides to match specific Tensor layout void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, const TensorAxis alignment) { - EnsureStridesExist(m_bufferTensorDesc.DimensionCount); + EnsureStridesExist(); SetDimensionCount(static_cast(dimensionMapping.size()), alignment); // Shuffle m_sizes and m_strides according to the indexes pointed by dimensionMapping @@ -336,7 +336,7 @@ void TensorDesc::PermuteDimensions(gsl::span dimensionMapping, c m_bufferTensorDesc.Strides = m_strides; } -void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) +void TensorDesc::EnsureStridesExist() { if (m_bufferTensorDesc.Strides != nullptr) { @@ -344,9 +344,8 @@ void TensorDesc::EnsureStridesExist(uint32_t dimensionCount) return; } - int index = static_cast(dimensionCount); uint32_t stride = 1; - for (int i = index; i-- > 0;) + for (uint32_t i = m_bufferTensorDesc.DimensionCount; i-- > 0;) { m_strides[i] = stride; stride *= m_sizes[i]; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h index 57015e3fb58d1..909e2084d0163 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/TensorDesc.h @@ -92,7 +92,7 @@ namespace Dml uint32_t m_strides[MaximumDimensionCount] = {}; DML_BUFFER_TENSOR_DESC m_bufferTensorDesc = {}; - void EnsureStridesExist(uint32_t count); + void EnsureStridesExist(); }; class TensorDescBuilder
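The patch series above removes the hand-rolled NHWC-to-NCHW size/stride computation from DmlOperator and instead routes layout changes through TensorDesc::PermuteDimensions with lazily derived packed strides. The standalone C++ sketch below is illustrative only and is not part of the patch series: it mirrors the end state of patches 8/9 and 9/9 using a simplified, hypothetical MiniTensorDesc type (MiniTensorDesc and the main driver are invented for the example and do not exist in the ONNX Runtime sources). It derives packed strides by walking the sizes backwards, then re-views the tensor through a dimension mapping, so an NHWC tensor can be treated as NCHW without copying data.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Hypothetical, simplified stand-in for Dml::TensorDesc, for illustration only.
    struct MiniTensorDesc
    {
        std::vector<uint32_t> sizes;
        std::vector<uint32_t> strides; // empty => packed layout, strides not set yet

        // Same idea as TensorDesc::EnsureStridesExist() after patch 9/9: if no
        // strides were supplied, derive packed (row-major) strides from the sizes.
        void EnsureStridesExist()
        {
            if (!strides.empty())
            {
                return; // strides are already populated
            }
            strides.resize(sizes.size());
            uint32_t stride = 1;
            for (uint32_t i = static_cast<uint32_t>(sizes.size()); i-- > 0;)
            {
                strides[i] = stride;
                stride *= sizes[i];
            }
        }

        // Same idea as TensorDesc::PermuteDimensions(): reorder sizes and strides
        // according to dimensionMapping. Only the logical view changes; no data moves.
        void PermuteDimensions(const std::vector<uint32_t>& dimensionMapping)
        {
            assert(dimensionMapping.size() == sizes.size());
            EnsureStridesExist();
            const std::vector<uint32_t> tempSizes = sizes;
            const std::vector<uint32_t> tempStrides = strides;
            for (size_t i = 0; i < dimensionMapping.size(); ++i)
            {
                sizes[i] = tempSizes[dimensionMapping[i]];
                strides[i] = tempStrides[dimensionMapping[i]];
            }
        }
    };

    int main()
    {
        // NHWC shape {1, 32, 32, 3}; the mapping {0, 3, 1, 2} re-views it as NCHW
        // {1, 3, 32, 32} with strides {3072, 1, 96, 3}, matching what the removed
        // ConvertNHWCToNCHW helper used to compute by hand.
        MiniTensorDesc desc{ {1, 32, 32, 3}, {} };
        desc.PermuteDimensions({0, 3, 1, 2});
        for (size_t i = 0; i < desc.sizes.size(); ++i)
        {
            std::printf("dim %zu: size=%u stride=%u\n", i, desc.sizes[i], desc.strides[i]);
        }
        return 0;
    }

Expressing the layout change as a stride permutation is what lets QLinearAveragePooling, and any future NHWC-aware operator, share one code path in TensorDesc instead of each carrying a layout-specific helper like the removed ConvertNHWCToNCHW.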