
Commit

Merge remote-tracking branch 'origin/main' into skottmckay/MakePartitioningUtilsQDQAware
skottmckay committed Mar 7, 2024
2 parents 4df61e9 + bff4f8b commit cb41315
Showing 41 changed files with 962 additions and 370 deletions.
6 changes: 3 additions & 3 deletions .pipelines/windowsai-steps.yml
@@ -80,11 +80,11 @@ jobs:
       # must call vsdevcmd first to add cmake to PATH
       - script: |
-          curl -O -L https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-windows-x86_64.zip
-          7z x cmake-3.26.3-windows-x86_64.zip
+          curl -O -L https://github.com/Kitware/CMake/releases/download/v3.28.3/cmake-3.28.3-windows-x86_64.zip
+          7z x cmake-3.28.3-windows-x86_64.zip
           set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools
           set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools
-          $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\ctest.exe
+          $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --parallel --use_binskim_compliant_compile_flags --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos --windows_sdk_version "10.0.22621.0" $(BuildFlags) --cmake_extra_defines "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO=/PROFILE" --cmake_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.28.3-windows-x86_64\bin\ctest.exe
       workingDirectory: '$(Build.BinariesDirectory)'
       displayName: 'Generate cmake config'
1 change: 1 addition & 0 deletions include/onnxruntime/core/graph/constants.h
@@ -31,6 +31,7 @@ constexpr size_t kMaxExecutionProviderNameLen = 30;

 constexpr const char* kCpuExecutionProvider = "CPUExecutionProvider";
 constexpr const char* kCudaExecutionProvider = "CUDAExecutionProvider";
+constexpr const char* kCudaNHWCExecutionProvider = "CUDANHWCExecutionProvider";
 constexpr const char* kDnnlExecutionProvider = "DnnlExecutionProvider";
 constexpr const char* kOpenVINOExecutionProvider = "OpenVINOExecutionProvider";
 constexpr const char* kVitisAIExecutionProvider = "VitisAIExecutionProvider";
@@ -214,13 +214,13 @@ Status CheckInputs(const Tensor* query,
                            "head_size shall be a multiple of 16. Got head_size % 16 == ",
                            head_size % 16);
   }
-  if (cos_dims[0] != present_sequence_length) {
+  if (cos_dims[0] < present_sequence_length) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "cos_cache dimension 0 must be of present_sequence_length.");
+                           "cos_cache dimension 0 should be of max_sequence_length.");
   }
-  if (sin_dims[0] != present_sequence_length) {
+  if (sin_dims[0] < present_sequence_length) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "sin_cache dimension 0 must be of present_sequence_length.");
+                           "sin_cache dimension 0 should be of max_sequence_length.");
   }
   if (cos_dims[1] != (head_size / 16) * 8) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
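The relaxed checks above let a rotary cos/sin cache that was allocated once for the maximum sequence length be reused for shorter sequences: dimension 0 now only needs to be at least present_sequence_length rather than exactly equal to it. A minimal sketch of the new contract, with names borrowed from the hunk (max_sequence_length as the assumed allocation size is inferred from the new error message):

#include <cstdint>

// Sketch only: a cache sized for max_sequence_length passes for any
// present_sequence_length <= max_sequence_length.
inline bool RotaryCacheLongEnough(int64_t cache_dim0, int64_t present_sequence_length) {
  // Old rule: cache_dim0 == present_sequence_length (exact match required).
  // New rule: cache_dim0 >= present_sequence_length (lower bound).
  return cache_dim0 >= present_sequence_length;
}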
9 changes: 8 additions & 1 deletion onnxruntime/core/common/string_utils.h
@@ -66,7 +66,14 @@ inline std::string TrimString(std::string s) {
 }

 /**
- * So use this simple hash to generate unique int by given string input.
+ * @brief A consistent way to construct the full qualified op name.
+ */
+inline std::string GetFullQualifiedOpName(const std::string& op_type, const std::string& domain) {
+  return MakeString(domain, "::", op_type);
+}
+
+/**
+ * Use this simple hash to generate unique int by given string input.
  */
 inline uint32_t GetHashFromString(const std::string& str_value) {
   uint32_t hash = 0;
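For context, a small usage sketch of the relocated helper. It assumes the usual ONNX Runtime domain constants (kOnnxDomain is the empty string, kMSDomain is "com.microsoft"); the stand-in body below mirrors what MakeString(domain, "::", op_type) produces:

#include <iostream>
#include <string>

// Stand-in for utils::GetFullQualifiedOpName, for illustration only.
std::string GetFullQualifiedOpName(const std::string& op_type, const std::string& domain) {
  return domain + "::" + op_type;
}

int main() {
  std::cout << GetFullQualifiedOpName("MatMul", "") << "\n";                 // prints "::MatMul"
  std::cout << GetFullQualifiedOpName("BiasGelu", "com.microsoft") << "\n";  // prints "com.microsoft::BiasGelu"
}

Moving the helper into core/common/string_utils.h gives the optimizer call sites below one shared definition in place of the per-class copy removed further down.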
@@ -106,6 +106,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function<void(ONNX_NAMES
   REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 14);
   REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 15);

+  REGISTER_NHWC_SCHEMA(fn, DepthToSpace, 1);
   REGISTER_NHWC_SCHEMA(fn, DepthToSpace, 11);
   REGISTER_NHWC_SCHEMA(fn, DepthToSpace, 13);

25 changes: 13 additions & 12 deletions onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc
@@ -4,6 +4,7 @@
 #ifdef ENABLE_TRAINING

 #include <onnx/defs/attr_proto_util.h>
+#include "core/common/string_utils.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/utils.h"
@@ -26,38 +27,38 @@ UpStreamGatherGraphTransformer::UpStreamGatherGraphTransformer(
       // 2. Whether the outputs have the same dim changes if the Gather node moves before that operator.
       // 3. Should all inputs be allowed when tracking back further (bottom-up);
       //    if not, add the input index restriction as MatMul did.
-      {GetFullQualifiedOpName("Add", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Add", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SimplePointwiseGatherActor<true>>(),
                                                             opset_14_13_7_6_1)},
-      {GetFullQualifiedOpName("BiasGelu", kMSDomain),
+      {utils::GetFullQualifiedOpName("BiasGelu", kMSDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SimplePointwiseGatherActor<true>>(), opset_1)},

-      {GetFullQualifiedOpName("Cast", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Cast", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SimplePointwiseGatherActor<true>>(),
                                                             opset_19_13_9_6_1)},
-      {GetFullQualifiedOpName("Div", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Div", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SimplePointwiseGatherActor<true>>(),
                                                             opset_14_13_7_6_1)},
-      {GetFullQualifiedOpName("Dropout", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Dropout", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SimplePointwiseGatherActor<true>>(),
                                                             opset_13_12_10_7_6_1)},
-      {GetFullQualifiedOpName("Gelu", kMSDomain),
+      {utils::GetFullQualifiedOpName("Gelu", kMSDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SimplePointwiseGatherActor<true>>(),
                                                             opset_1)},
       {// Be noted, this is our own implementation of ONNX domain op.
-       GetFullQualifiedOpName("LayerNormalization", kOnnxDomain),
+       utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<LayerNormalizationGatherActor>(),
                                                             opset_1)},
-      {GetFullQualifiedOpName("MatMul", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("MatMul", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<MatMulGatherActor>(),
                                                             opset_13_9_1)},
-      {GetFullQualifiedOpName("Reshape", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Reshape", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<ReshapeGatherActor>(),
                                                             opset_19_14_13_5_1)},
-      {GetFullQualifiedOpName("Softmax", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Softmax", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<SoftmaxGatherActor>(),
                                                             opset_13_11_1)},
-      {GetFullQualifiedOpName("Transpose", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Transpose", kOnnxDomain),
        OpPassThroughConfig<UpStreamGatherOperatorActorBase>(std::make_shared<TransposeGatherActor>(),
                                                             opset_13_1)},
     });
@@ -69,7 +70,7 @@ bool UpStreamGatherGraphTransformer::UpStreamInternal(
     const OpPassThroughConfig<UpStreamGatherOperatorActorBase>& pass_through_config,
     const logging::Logger& logger) const {
   Node& slice_node = *info.node_ptr;
-  const std::string op_type = GetFullQualifiedOpName(current_node.OpType(), current_node.Domain());
+  const std::string op_type = utils::GetFullQualifiedOpName(current_node.OpType(), current_node.Domain());

   std::unordered_map<int, int> propagate_input_indices;
   std::unordered_map<int, std::vector<DimCompare>> all_input_cmp_rets;
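The map keys are the fully qualified names produced by the helper, and Upstream dispatches on the current node's qualified name. A simplified, self-contained sketch of that dispatch pattern (std::string stands in for OpPassThroughConfig; op names follow the table above):

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  // Keyed exactly like allowed_passthrough_ops_: "domain::op_type".
  const std::unordered_map<std::string, std::string> allowed = {
      {"::MatMul", "MatMulGatherActor"},  // kOnnxDomain is ""
      {"com.microsoft::BiasGelu", "SimplePointwiseGatherActor"},
  };
  const std::string op_type = "::MatMul";  // utils::GetFullQualifiedOpName("MatMul", kOnnxDomain)
  if (allowed.count(op_type)) {
    std::cout << "pass-through handled by " << allowed.at(op_type) << "\n";
  }
}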
15 changes: 8 additions & 7 deletions onnxruntime/core/optimizer/compute_optimizer/upstream_reshape.cc
@@ -4,6 +4,7 @@
 #ifdef ENABLE_TRAINING

 #include "core/framework/tensorprotoutils.h"
+#include "core/common/string_utils.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/utils.h"
 #include "core/optimizer/compute_optimizer/upstream_reshape_actors.h"
@@ -21,23 +22,23 @@ UpStreamReshapeGraphTransformer::UpStreamReshapeGraphTransformer(
       //    If optype is not enough to guarantee the equivalence, we need to add a customized pre-check function.
       // 2. Should all inputs be allowed when tracking back further (bottom-up);
       //    if not, add the input index restriction.
-      {GetFullQualifiedOpName("Add", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Add", kOnnxDomain),
        OpPassThroughConfig<UpStreamReshapeOperatorActorBase>(
            std::make_shared<SimplePointwiseReshapeActor<true>>(), opset_14_13_7_6_1)},
-      {GetFullQualifiedOpName("BiasGelu", kMSDomain),
+      {utils::GetFullQualifiedOpName("BiasGelu", kMSDomain),
        OpPassThroughConfig<UpStreamReshapeOperatorActorBase>(
            std::make_shared<SimplePointwiseReshapeActor<true>>(), opset_1)},
-      {GetFullQualifiedOpName("Cast", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Cast", kOnnxDomain),
        OpPassThroughConfig<UpStreamReshapeOperatorActorBase>(
            std::make_shared<SimplePointwiseReshapeActor<true>>(), opset_19_13_9_6_1)},
-      {GetFullQualifiedOpName("Dropout", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("Dropout", kOnnxDomain),
        OpPassThroughConfig<UpStreamReshapeOperatorActorBase>(
            std::make_shared<SimplePointwiseReshapeActor<true>>(), opset_13_12_10_7_6_1)},
       {// Be noted, this is our own implementation of ONNX domain op.
-       GetFullQualifiedOpName("LayerNormalization", kOnnxDomain),
+       utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain),
        OpPassThroughConfig<UpStreamReshapeOperatorActorBase>(
            std::make_shared<LayerNormalizationReshapeActor>(), opset_1)},
-      {GetFullQualifiedOpName("MatMul", kOnnxDomain),
+      {utils::GetFullQualifiedOpName("MatMul", kOnnxDomain),
        OpPassThroughConfig<UpStreamReshapeOperatorActorBase>(
            std::make_shared<MatMulReshapeActor>(), opset_13_9_1)},
     });
@@ -47,7 +48,7 @@ bool UpStreamReshapeGraphTransformer::UpStreamInternal(
     Graph& graph, std::deque<ReshapeInfo>& queue, Node& current_node, ReshapeInfo& info,
     const OpPassThroughConfig<UpStreamReshapeOperatorActorBase>& pass_through_config,
     const logging::Logger& logger) const {
-  const std::string op_type = GetFullQualifiedOpName(current_node.OpType(), current_node.Domain());
+  const std::string op_type = utils::GetFullQualifiedOpName(current_node.OpType(), current_node.Domain());

   std::vector<int> propagate_input_indices;
   std::unordered_map<int, std::vector<DimCompare>> all_input_cmp_rets;
@@ -5,6 +5,7 @@

 #include <onnx/defs/attr_proto_util.h>
 #include "core/common/safeint.h"
+#include "core/common/string_utils.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/utils.h"
@@ -130,7 +131,7 @@ template <typename T1, typename T2>
 bool UpStreamGraphTransformerBase<T1, T2>::Upstream(Graph& graph, std::deque<T1>& queue,
                                                     Node& current_node, T1& info,
                                                     const logging::Logger& logger) const {
-  const std::string op_type = GetFullQualifiedOpName(current_node.OpType(), current_node.Domain());
+  const std::string op_type = utils::GetFullQualifiedOpName(current_node.OpType(), current_node.Domain());
   if (allowed_passthrough_ops_.count(op_type)) {
     auto& pass_through_config = allowed_passthrough_ops_.at(op_type);
     LOG_DEBUG_INFO(logger, "Enter reorder handle for node " + current_node.Name() + "(" + op_type + ")");
@@ -72,13 +72,6 @@ class UpStreamGraphTransformerBase : public GraphTransformer {
                               const OpPassThroughConfig<T2>& pass_through_config,
                               const logging::Logger& logger) const = 0;

-  /**
-   * @brief A consistent way to construct the full qualified op name.
-   */
-  std::string GetFullQualifiedOpName(const std::string& op_type, const std::string& domain) const {
-    return domain + "::" + op_type;
-  }
-
   std::unordered_map<std::string, OpPassThroughConfig<T2>> allowed_passthrough_ops_;

 private:
@@ -43,7 +43,8 @@ const std::unordered_set<std::string_view>& GetCUDALayoutSensitiveOps() {
         "GlobalAveragePool",
         "AveragePool",
         "GridSample",
-    };
+        "DepthToSpace",
+        "SpaceToDepth"};
   }();
   return cuda_nhwc_ops;
 }
16 changes: 12 additions & 4 deletions onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
@@ -14,6 +14,7 @@ class SpaceDepthBase {
                 "Attribute blocksize is not set.");
   }

+  template <bool IsNHWC = false>
   Status InputValidationsAndOutputDimsCalc(const Tensor& input,
                                            int64_t& batch,
                                            int64_t& input_depth, int64_t& input_height, int64_t& input_width,
@@ -27,9 +28,15 @@
     }

     batch = input_shape[0];
-    input_depth = input_shape[1];
-    input_height = input_shape[2];
-    input_width = input_shape[3];
+    if constexpr (IsNHWC) {
+      input_depth = input_shape[3];
+      input_height = input_shape[1];
+      input_width = input_shape[2];
+    } else {
+      input_depth = input_shape[1];
+      input_height = input_shape[2];
+      input_width = input_shape[3];
+    }

     if (is_space_to_depth) {  // SpaceToDepth op
       if ((input_height % this->blocksize_) != 0) {
@@ -46,7 +53,8 @@

     } else {  // DepthToSpace op
       if ((input_depth % (blocksize_ * blocksize_) != 0)) {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "DepthToSpace requires input depth to be a multiple of (block_size * blok_size)");
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "DepthToSpace requires input depth to be a multiple of (block_size * block_size)");
       }

       output_depth = input_depth / blocksize_ / blocksize_;
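A standalone sketch of the layout-dependent index mapping the new template parameter introduces (4-D shapes assumed; NCHW remains the default since IsNHWC defaults to false):

#include <array>
#include <cstdint>

// Extract (depth, height, width) from a 4-D shape under either layout,
// mirroring the `if constexpr` dispatch above.
template <bool IsNHWC = false>
std::array<int64_t, 3> ExtractDHW(const std::array<int64_t, 4>& shape) {
  if constexpr (IsNHWC) {
    return {shape[3], shape[1], shape[2]};  // N,H,W,C: channels/depth last
  } else {
    return {shape[1], shape[2], shape[3]};  // N,C,H,W: channels/depth after batch
  }
}

// Example: an NCHW shape {1, 32, 8, 8} and its NHWC counterpart {1, 8, 8, 32}
// both yield depth = 32, height = 8, width = 8.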
16 changes: 16 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc
@@ -86,6 +86,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalN
                                             BatchNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 15, MLFloat16,
                                             BatchNormalization);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, DepthToSpace);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 11, 12, DepthToSpace);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 13, DepthToSpace);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 12, SpaceToDepth);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 13, SpaceToDepth);

 Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn nhwc_function_table[] = {
@@ -171,6 +176,17 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) {
           kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, ConvTranspose)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(
           kCudaExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, ConvTranspose)>,
+
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain,
+                                                                      1, 10, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain,
+                                                                      11, 12, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain,
+                                                            13, DepthToSpace)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain,
+                                                                      1, 12, SpaceToDepth)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain,
+                                                            13, SpaceToDepth)>,
   };

   for (auto& function_table_entry : nhwc_function_table) {
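The registrations mirror the opset history of the two operators: DepthToSpace gets versioned kernels for opsets [1, 10] and [11, 12] plus an open-ended kernel from 13 on, while SpaceToDepth only needs [1, 12] and 13+. A small illustrative helper showing which registration range a given opset resolves to (the function and strings below are hypothetical, not ORT APIs):

// Illustrative only: maps an opset version to the NHWC DepthToSpace
// registration ranges declared above.
const char* DepthToSpaceKernelRangeFor(int opset) {
  if (opset >= 13) return "13+ (unversioned kernel)";
  if (opset >= 11) return "11-12 (versioned kernel)";
  return "1-10 (versioned kernel)";
}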