Skip to content

Commit

Permalink
Resize and EP specific transpose optimization updates (microsoft#17664)
Browse files Browse the repository at this point in the history
### Description
<!-- Describe your changes. -->
- Treat Resize as layout sensitive by default
- whilst the ONNX spec does not specify a layout, EPs tend to implement
only one
- add a second usage in L2 of the TransposeOptimizer to plug in the ability to
push a Transpose through a Resize assigned to the CPU EP
- Allow EP-specific logic for changing the set of ops considered to be layout
sensitive to be plugged in
  - expected usage is for microsoft#17200 


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
Finish simplifying/clarifying transpose optimization and layout
transformation that was proposed in microsoft#15552. This PR along with microsoft#17618
should complete the changes.

---------

Co-authored-by: Edward Chen <[email protected]>
  • Loading branch information
2 people authored and kleiti committed Mar 22, 2024
1 parent a397e1f commit e3072c8
Show file tree
Hide file tree
Showing 16 changed files with 235 additions and 115 deletions.
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/graph_partitioner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,9 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) {
}

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// Run layout transformer only for EPs other than CPU EP and provided the preferred layout is NHWC
// Run layout transformer for EPs with preferred layout of NHWC
// CPU EP layout transformation happens later when level 3 transformers are run.
if (params.mode != GraphPartitioner::Mode::kAssignOnly &&
if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get() &&
current_ep.GetPreferredLayout() == DataLayout::NHWC) {
for (auto& capability : capabilities) {
TryAssignNodes(graph, *capability->sub_graph, ep_type);
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/core/framework/kernel_registry_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ Status KernelRegistryManager::SearchKernelRegistry(const Node& node,
auto create_error_message = [&node, &status](const std::string& prefix) {
std::ostringstream errormsg;
errormsg << prefix << node.OpType() << "(" << node.SinceVersion() << ")";
if (!node.Name().empty()) errormsg << " (node " << node.Name() << "). ";
if (!status.IsOK()) errormsg << status.ErrorMessage();
errormsg << " (node:'" << node.Name() << "' ep:'" << node.GetExecutionProviderType() << "'). ";
if (!status.IsOK())
errormsg << status.ErrorMessage();

return errormsg.str();
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function<void(ONNX_NAMES
// so supporting older opsets is unnecessary.

// NOTE: This should be in sync with GetLayoutSensitiveOps in
// /onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
// /onnxruntime/core/optimizer/transpose_optimization/transpose_optimizer.cc
REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 11);

REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 9);
Expand Down
19 changes: 11 additions & 8 deletions onnxruntime/core/optimizer/graph_transformer_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
const InlinedHashSet<std::string_view> cpu_ep = {onnxruntime::kCpuExecutionProvider};
#endif
const InlinedHashSet<std::string_view> dml_ep = {onnxruntime::kDmlExecutionProvider};
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();

switch (level) {
case TransformerLevel::Level1: {
// RewriteRule optimizations are the simplest (they generally remove unnecessary nodes and are cheap to run)
Expand Down Expand Up @@ -240,13 +242,14 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(

// run TransposeOptimizer last as it works in a slightly different way by moving Transpose nodes around.
// shouldn't affect the end result - just easier to debug any issue if it's last.
// local CPU allocator is enough as this allocator is finally passed to a local tensor.
// We will also benefit by using a local allocator as we don't need to pass allocator as parameter for EP API refactor
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
transformers.emplace_back(std::make_unique<TransposeOptimizer>(std::move(cpu_allocator)));
} break;

case TransformerLevel::Level2: {
// we run TransposeOptimizer again in Level2 for some CPU EP specific optimizations that can only be
// applied once nodes are assigned to the CPU EP (which happens between level 1 and level 2).
transformers.emplace_back(std::make_unique<TransposeOptimizer>(std::move(cpu_allocator), kCpuExecutionProvider));

const bool enable_quant_qdq_cleanup =
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQCleanup, "0") == "1";
#if !defined(DISABLE_CONTRIB_OPS)
Expand Down Expand Up @@ -366,16 +369,16 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
if (MlasNchwcGetBlockSize() > 1) {
transformers.emplace_back(std::make_unique<NchwcTransformer>());
}
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();

auto cpu_registry = cpu_execution_provider.GetKernelRegistry();
auto nhwc_transformer = std::make_unique<NhwcTransformer>(std::move(cpu_allocator), std::move(cpu_registry));
if (nhwc_transformer->IsActive()) {
transformers.emplace_back(std::move(nhwc_transformer));
}
// NCHWCtransformer should have a higher priority versus this. Because NCHWCtransformer also do the similar things
// of fusion patterns and target on CPU. However, NCHWCtransformer will reorder the layout to nchwc which is only available for
// x86-64 cpu, not edge cpu like arm. But This transformer could be used by opencl-ep/cpu-ep. So
// we will prefer NhwcTransformer once ort runs on x86-64 CPU, otherwise ConvAddActivationFusion is enabled.

// NchwcTransformer must have a higher priority than ConvAddActivationFusion. NchwcTransformer does similar
// fusions targeting CPU but also reorders the layout to NCHWc which is expected to be more efficient but is
// only available on x86-64.
// PR #6351 implemented similar fusion-pattern for CUDA only, and can only fuse conv-add-relu,
// while we can fuse more activation.
transformers.emplace_back(std::make_unique<ConvAddActivationFusion>(cpu_ep));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,91 @@ using namespace onnx_transpose_optimization;

namespace onnxruntime {
namespace layout_transformation {
namespace {
// Cost check used after layout transformation: aggressively pushes the Transpose nodes that the layout
// transformation inserted further out through the graph.
CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
                                             const std::vector<int64_t>& perm,
                                             const std::unordered_set<std::string>& outputs_leading_to_transpose) {
  // a layout-transformation Transpose converts between channel-first and channel-last ordering
  const auto rank = perm.size();
  const bool is_layout_perm = perm == ChannelFirstToLastPerm(rank) || perm == ChannelLastToFirstPerm(rank);

  // Push unconditionally, with one exception: pushing through a Concat can add Transpose nodes to multiple
  // other inputs, which can potentially be worse for performance, so defer to the cost check in that case.
  if (is_layout_perm && node.OpType() != "Concat") {
    return CostCheckResult::kPushTranspose;
  }

  // everything else uses the default ORT cost check
  return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
}

/// <summary>
/// Default check for whether a node should have its layout changed. Applies EP specific adjustments to the
/// default set of layout sensitive operators where required.
///
/// Longer term, if required, the EP API could allow each EP to provide a delegate with its own logic so we
/// don't hardcode the special cases here.
/// </summary>
/// <param name="node">Node to check</param>
/// <returns>true if the node should have its layout converted to NHWC.</returns>
bool ConvertNodeLayout(const api::NodeRef& node) {
  // only ONNX and contrib ops are considered
  const auto node_domain = node.Domain();
  if (node_domain != kOnnxDomain && node_domain != kMSDomain) {
    return false;
  }

  // EP specific special cases
#if defined(USE_XNNPACK)
  if (node.OpType() == "Resize" && node.GetExecutionProviderType() == kXnnpackExecutionProvider) {
    // XNNPACK supports both NCHW and NHWC for Resize, so there is no need to move it to the internal NHWC
    // domain and wrap it with Transpose nodes. EPAwareHandleResize will allow an NCHW <-> NHWC Transpose
    // to be pushed through the Resize during transpose optimization.
    return false;
  }
#endif

#if defined(USE_JSEP)
  // TODO(fs-eire): Remove special case handling of JSEP once NHWC Resize implementation is fixed
  if (node.OpType() == "Resize" && node.GetExecutionProviderType() == kJsExecutionProvider) {
    // leave Resize as-is pending bugfix for NHWC implementation. this means the node will remain in the
    // ONNX domain with the original input layout.
    return false;
  }
#endif

  // #if defined(USE_CUDA)
  //   if (node.GetExecutionProviderType() == kCudaExecutionProvider) {
  //     Update as per https://github.com/microsoft/onnxruntime/pull/17200 with CUDA ops that support NHWC
  //   }
  // #endif

  const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps();
  return layout_sensitive_ops.find(node.OpType()) != layout_sensitive_ops.end();
}
} // namespace

// Layout sensitive NCHW ops. TransformLayoutForEP will wrap these with Transpose nodes to convert the input
// data to NHWC and output data back to NCHW, and move the op to the internal NHWC domain (kMSInternalNHWCDomain).
// The EP requesting these ops MUST be able to handle the node with the operator in the kMSInternalNHWCDomain.
// The EP requesting these ops MUST be able to handle the node with the operator in the kMSInternalNHWCDomain domain.
// Once all the layout sensitive ops requested by the EP are wrapped the transpose optimizer will attempt to remove
// as many of the layout transposes as possible.
const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
static std::unordered_set<std::string_view> ort_layout_sensitive_ops = []() {
const auto& layout_sensitive_ops = onnx_transpose_optimization::GetLayoutSensitiveOps();
std::unordered_set<std::string_view> ort_specific_ops =
{ "FusedConv",
"QLinearAveragePool",
"QLinearGlobalAveragePool"
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN) || defined(USE_WEBNN)
// The CUDA/ROCM Resize kernel is layout sensitive as it only handles NCHW input.
// The CPU kernel and ONNX spec are not limited to handling NCHW input so are not layout sensitive, and
// onnx_layout_transformation::HandleResize is used.
,
"Resize"
#endif
};
{
"FusedConv",
"QLinearAveragePool",
"QLinearGlobalAveragePool",
// Whilst the ONNX spec doesn't specify a layout for Resize, we treat it as layout sensitive by default
// as EPs tend to only support one layout.
"Resize",
};

ort_specific_ops.insert(layout_sensitive_ops.cbegin(), layout_sensitive_ops.cend());
return ort_specific_ops;
Expand All @@ -42,45 +106,21 @@ const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
return ort_layout_sensitive_ops;
}

// Cost check for aggressively pushing the Transpose nodes involved in the layout transformation further out.
// File-local: passed as the cost-check delegate when running the transpose optimizer after layout transformation.
static CostCheckResult
PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
                             const std::vector<int64_t>& perm,
                             const std::unordered_set<std::string>& outputs_leading_to_transpose) {
  // we aggressively push the layout transpose nodes.
  // Exception: pushing through a Concat can result in Transpose nodes being added to multiple other inputs which
  // can potentially be worse for performance. Use the cost check in that case.
  // NOTE(review): ChannelFirstToLastPerm/ChannelLastToFirstPerm produce the NCHW<->NHWC style permutations for
  // the given rank, so this matches exactly the transposes inserted by layout transformation.
  if (node.OpType() != "Concat" &&
      (perm == ChannelFirstToLastPerm(perm.size()) || perm == ChannelLastToFirstPerm(perm.size()))) {
    return CostCheckResult::kPushTranspose;
  }

  // for other nodes use the default ORT cost check
  return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
}

Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvider& execution_provider,
AllocatorPtr cpu_allocator,
const DebugGraphFn& debug_graph_fn) {
// We pass in nullptr for the new_node_ep param as new nodes will be assigned by the graph partitioner after
// TransformLayoutForEP returns.
// sub graph recurse will be added later.
// sub graph recurse will be added later
auto api_graph = MakeApiGraph(graph, cpu_allocator, /*new_node_ep*/ nullptr);
const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps();

// to convert to NHWC we need to wrap layout sensitive nodes to Transpose from NCHW to NHWC and back.
for (auto& node : api_graph->Nodes()) {
if (layout_sensitive_ops.count(node->OpType())) {
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}

auto domain = node->Domain();
// Skip if domain is incorrect
if (domain != kOnnxDomain && domain != kMSDomain) {
continue;
}
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}

if (ConvertNodeLayout(*node)) {
// if already transformed then change the domain to kMSInternalNHWCDomain this way the EP
// knows this op is in the expected format.
if (node->GetAttributeIntDefault("channels_last", 0) == 1) {
Expand Down Expand Up @@ -137,7 +177,6 @@ Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvid
WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});
}

// TODO: Technically Resize doesn't need to change domain as the ONNX Resize spec is not layout sensitive.
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
modified = true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1242,18 +1242,7 @@ static void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, con
node.SetInput(i, gather_output);
}

static bool HandleResize([[maybe_unused]] HandlerArgs& args) {
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN) || defined(USE_WEBNN)
// The CUDA Resize kernel requires that the input is NCHW, so we can't push a Transpose through a Resize
// in ORT builds with CUDA enabled.
// The ROCm EP is generated from the CUDA EP kernel so the same applies to builds with ROCm enabled.
// The QNN EP requires the input to be NHWC, so the Resize handler is also not enabled for QNN builds.
//
// TODO: Remove this special case once the CUDA Resize kernel is implemented "generically" (i.e.) aligning with the
// generic nature of the ONNX spec.
// See https://github.com/microsoft/onnxruntime/pull/10824 for a similar fix applied to the CPU Resize kernel.
return false;
#else
bool HandleResize([[maybe_unused]] HandlerArgs& args) {
auto inputs = args.node.Inputs();
int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());

Expand All @@ -1279,10 +1268,10 @@ static bool HandleResize([[maybe_unused]] HandlerArgs& args) {
TransposeOutputs(args.ctx, args.node, args.perm);

return true;
#endif
}

constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
// Not currently registered by default.
// constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};

static bool HandlePad(HandlerArgs& args) {
size_t rank = args.perm.size();
Expand Down Expand Up @@ -2034,15 +2023,19 @@ static const std::unordered_map<std::string_view, const HandlerInfo&> handler_ma
{"Split", split_handler},
{"Shape", shape_handler},
{"Pad", pad_handler},
{"Resize", resize_handler},
{"ReduceSum", reduce_op_handler},

// Execution providers tend to only implement Resize for specific layouts. Due to that, it's safer to not
// push a Transpose through a Resize unless the EP specifically checks that it can handle the change via an
// extended handler.
// {"Resize", resize_handler},

{"ReduceLogSum", reduce_op_handler},
{"ReduceLogSumExp", reduce_op_handler},
{"ReduceMax", reduce_op_handler},
{"ReduceMean", reduce_op_handler},
{"ReduceMin", reduce_op_handler},
{"ReduceProd", reduce_op_handler},
{"ReduceSum", reduce_op_handler},
{"ReduceSumSquare", reduce_op_handler},
{"ReduceL1", reduce_op_handler},
{"ReduceL2", reduce_op_handler},
Expand Down Expand Up @@ -2385,6 +2378,8 @@ OptimizeResult OptimizeImpl(OptimizerCtx& ctx) {
continue;
}

// NOTE: this bleeds ORT specific logic into the base optimizer, however we justify that for now because we expect
// the types that the ORT DQ provides to be added to the ONNX spec, at which point this special case can go away.
if (IsMSDomain(dq_domain) && !TransposeQuantizeDequantizeAxis(ctx.graph, perm_inv, *dq_node)) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ bool HandleSimpleNodeWithAxis(HandlerArgs& args, std::optional<int64_t> default_

// base handlers that are used by extended handlers. add from transpose_optimizer.cc as needed.
bool HandleReduceOps(HandlerArgs& args);
bool HandleResize([[maybe_unused]] HandlerArgs& args);

void TransposeInput(api::GraphRef& graph, api::NodeRef& node, size_t i,
const std::vector<int64_t>& perm,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,35 @@

#include <algorithm>
#include "core/graph/constants.h"
#include "core/framework/utils.h"
#include "core/optimizer/transpose_optimization/ort_optimizer_utils.h"

using namespace onnx_transpose_optimization;

namespace onnxruntime {

static bool EPAwareHandleResize(HandlerArgs& args) {
  // Whilst Resize is not technically layout sensitive, execution providers typically implement handling for
  // only one layout. Due to that, only push a Transpose through a Resize once the node has been assigned and
  // we know it's being handled by an EP that supports multiple layouts. Currently that's the CPU and
  // XNNPACK EPs.
  const auto assigned_ep = args.node.GetExecutionProviderType();
  const bool ep_supports_multiple_layouts =
      assigned_ep == kCpuExecutionProvider || assigned_ep == kXnnpackExecutionProvider;
  if (!ep_supports_multiple_layouts) {
    return false;
  }

  // allow NCHW <-> NHWC for now. not clear any other sort of transpose has a valid usage in a real model.
  if (args.perm.size() == 4) {
    static const std::vector<int64_t> nchw_to_nhwc_perm{0, 2, 3, 1};
    static const std::vector<int64_t> nhwc_to_nchw_perm{0, 3, 1, 2};
    if (args.perm == nchw_to_nhwc_perm || args.perm == nhwc_to_nchw_perm) {
      return HandleResize(args);
    }
  }

  return false;
}

constexpr HandlerInfo ep_aware_resize_handler = {&FirstInput, &EPAwareHandleResize};

static bool HandleQLinearConcat(HandlerArgs& args) {
return HandleSimpleNodeWithAxis(args);
}
Expand Down Expand Up @@ -62,7 +85,7 @@ static bool HandleMaxPool(HandlerArgs& args) {
ORT_UNUSED_PARAMETER(args);
return false;
#else
if (args.node.GetExecutionProviderType() != "CPUExecutionProvider") {
if (args.node.GetExecutionProviderType() != kCpuExecutionProvider) {
return false;
}

Expand Down Expand Up @@ -103,6 +126,7 @@ static bool HandleContribQuantizeDequantizeLinear(HandlerArgs& args) {
}

constexpr HandlerInfo max_pool_op_handler = {&FirstInput, &HandleMaxPool};

constexpr HandlerInfo node_1_inp_handler = {&FirstInput, &HandleSimpleNode};
constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOps};
constexpr HandlerInfo contrib_quantize_dequantize_linear_handler = {&FirstInput,
Expand All @@ -113,6 +137,7 @@ const HandlerMap& OrtExtendedHandlers() {
static const HandlerMap extended_handler_map = []() {
HandlerMap map = {
{"MaxPool", max_pool_op_handler},
{"Resize", ep_aware_resize_handler},
{"com.microsoft.QuantizeLinear", contrib_quantize_dequantize_linear_handler},
{"com.microsoft.DequantizeLinear", contrib_quantize_dequantize_linear_handler},
{"com.microsoft.QLinearAdd", q_linear_binary_op_handler},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ namespace onnxruntime {
/// <summary>
/// Get the extended handlers for ORT specific transpose optimization.
/// These include handlers for contrib ops, and where we have an NHWC version of a layout sensitive op.
/// Extends the handlers returned by OrtHandlers.
/// </summary>
/// <returns>HandlerMap</returns>
const onnx_transpose_optimization::HandlerMap& OrtExtendedHandlers();
Expand Down
Loading

0 comments on commit e3072c8

Please sign in to comment.