From ba40022ec42a4b60d4b1ef875d6613923e9e8624 Mon Sep 17 00:00:00 2001
From: Wanming Lin <wanming.lin@intel.com>
Date: Wed, 23 Oct 2024 11:26:34 +0800
Subject: [PATCH] [WebNN EP] Support axes and fix some validation for Resize
 (#21952)

- Supports arbitrary axes for Resize opset 18+
- Check all inputs and attributes more carefully

---------

Co-authored-by: Dwayne Robinson <fdwr@hotmail.com>
---
 js/web/docs/webnn-operators.md                |   2 +-
 .../core/providers/webnn/builders/helper.h    |  36 +++
 .../webnn/builders/impl/resize_op_builder.cc  | 287 +++++++++++-------
 3 files changed, 216 insertions(+), 109 deletions(-)
diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md
index f696264aeead7..bf0f1dffb83ee 100644
--- a/js/web/docs/webnn-operators.md
+++ b/js/web/docs/webnn-operators.md
@@ -78,7 +78,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
 | ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✓ | ✓ | Input 'axes' if present should be a constant |
 | Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | |
 | Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported |
-| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, exclude_outside != 0, input 'scales' and 'sizes' if present must be a constant, 'linear' and 'nearest' modes |
+| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, coordinate_transformation_mode == 'half_pixel', exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
 | Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | |
 | Sigmoid | ai.onnx(7-12, 13+) | sigmoid | ✓ | ✓ | |
 | Softplus | ai.onnx(7+) | softplus | ✓ | ✓ | |
diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h
index aecb1f7a03bb9..ec9993bf138ba 100644
--- a/onnxruntime/core/providers/webnn/builders/helper.h
+++ b/onnxruntime/core/providers/webnn/builders/helper.h
@@ -36,6 +36,31 @@ WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type);
 // Collects all the initializer tensors in the subGraph and its ancestor graphs.
 InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer);
 
+inline std::vector<int64_t> convertAxesFromNCHWtoNHWC(const std::vector<int64_t>& axes) {
+  constexpr std::array<int64_t, 4> nchw_to_nhwc = {0, 3, 1, 2};  // NCHW axis -> NHWC axis: N->0, C->3, H->1, W->2.
+  std::vector<int64_t> new_axes;
+  new_axes.reserve(axes.size());
+  for (const int64_t axis : axes) {
+    if (axis < 0 || static_cast<size_t>(axis) >= nchw_to_nhwc.size()) {  // Explicit check, no sign conversion.
+      ORT_THROW("Invalid axis value: ", axis);
+    }
+    new_axes.push_back(nchw_to_nhwc[static_cast<size_t>(axis)]);
+  }
+  return new_axes;
+}
+
+inline std::vector<int64_t> HandleNegativeAxes(const std::vector<int64_t>& axes, size_t input_size) {
+  // Applies HandleNegativeAxis to every entry of 'axes', passing 'input_size' as its second argument.
+  std::vector<int64_t> resolved(axes.size());
+  std::transform(axes.begin(), axes.end(), resolved.begin(),
+                 [input_size](int64_t axis) { return HandleNegativeAxis(axis, input_size); });
+  return resolved;
+}
+
+inline std::vector<int64_t> GetResolvedAxes(const NodeAttrHelper& helper, size_t input_size) {
+  return HandleNegativeAxes(helper.Get("axes", std::vector<int64_t>{}), input_size);  // Fallback: empty axes list.
+}
+
 bool GetShape(const NodeArg& node_arg, std::vector<int64_t>& shape, const logging::Logger& logger);
 
 template <typename T>
@@ -144,6 +169,17 @@ inline bool ReadScalarTensorData(const onnx::TensorProto& tensor, emscripten::va
   return true;
 }
 
+inline bool IsEmptyTensor(const InitializedTensorSet& initializers, const std::string& name) {
+  // A name that is blank or not found in 'initializers' counts as empty.
+  if (name.empty() || !Contains(initializers, name)) {
+    return true;
+  }
+  // Bind by reference so the dimension list is scanned in place rather than copied.
+  const auto& dims = initializers.at(name)->dims();
+  // An empty tensor contains a 0 in the dimensions list.
+  return std::any_of(dims.begin(), dims.end(), [](auto d) { return d == 0; });
+}
+
 bool IsInputSupported(const NodeArg& node_arg, const std::string& parent_name, const logging::Logger& logger);
 
 // Get a list of groups of supported nodes, each group represents a subgraph supported by WebNN EP.
diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
index 9dc79f4f52f46..3442afbc2b3cd 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc
@@ -38,16 +38,33 @@ class ResizeOpBuilder : public BaseOpBuilder {
 };
 
 // Helper functions
-bool GetResizeScales(const InitializedTensorSet& initializers,
-                     const Node& node, std::vector<float>& scales,
-                     const logging::Logger& logger) {
+bool GetResizeScalesAndAxes(const InitializedTensorSet& initializers,
+                            const Node& node, std::vector<float>& scales,
+                            std::vector<int64_t>& axes, const bool is_nhwc,
+                            const logging::Logger& logger) {
   const auto& input_defs = node.InputDefs();
   if (input_defs.size() < 3)
     return false;
 
+  const bool has_axes = !axes.empty();
   const auto& scales_tensor = *initializers.at(input_defs[2]->Name());
-  if (scales_tensor.dims_size() != 1 || scales_tensor.dims()[0] != 4)
+  if (scales_tensor.dims_size() != 1) {
+    LOGS(logger, ERROR) << "'scales' should be a 1D tensor.";
     return false;
+  }
+
+  // Number of elements of 'scales' tensor.
+  const auto num_of_scales = scales_tensor.dims()[0];
+
+  if (has_axes && num_of_scales != 2) {
+    LOGS(logger, ERROR) << "When 'axes' is provided, 'scales' should have 2 elements.";
+    return false;
+  }
+
+  if (!has_axes && num_of_scales != 4) {
+    LOGS(logger, ERROR) << "When 'axes' is not provided, 'scales' should have 4 elements.";
+    return false;
+  }
 
   std::vector<uint8_t> unpacked_tensor;
   auto status = onnxruntime::utils::UnpackInitializerData(scales_tensor, unpacked_tensor);
@@ -56,20 +73,65 @@ bool GetResizeScales(const InitializedTensorSet& initializers,
     return false;
   }
   const float* scales_data = reinterpret_cast<const float*>(unpacked_tensor.data());
-  scales = std::vector<float>{scales_data, scales_data + 4};
+
+  if (has_axes) {
+    // 'axes' is specified since opset 18+, 'scales' should have 2 elements.
+    scales = std::vector<float>{scales_data, scales_data + 2};
+  } else {
+    // Before opset 18, 'scales' should have 4 elements.
+    // Make sure 'scales' is not trying to scale on N/C channels here.
+    std::vector<float> onnx_scales{scales_data, scales_data + 4};
+    // 'scales' input has been transposed to NHWC layout if it is NHWC preferred layout.
+    const float scale_n = onnx_scales[0];
+    const float scale_c = is_nhwc ? onnx_scales[3] : onnx_scales[1];
+    const float scale_h = is_nhwc ? onnx_scales[1] : onnx_scales[2];
+    const float scale_w = is_nhwc ? onnx_scales[2] : onnx_scales[3];
+    if (scale_n != 1.0f || scale_c != 1.0f) {  // Only the H and W scales are forwarded below.
+      LOGS(logger, VERBOSE) << "Scales of N/C channel should be 1, "
+                            << "scaling of N/C channels is not supported"
+                            << ", scale_n, " << scale_n << ", scale_c, " << scale_c;
+      return false;
+    }
+
+    scales = {scale_h, scale_w};
+    axes = {2, 3};
+  }
+
+  if (is_nhwc) {
+    // For NHWC preferred layout, we need to convert axes from NCHW to NHWC.
+    axes = convertAxesFromNCHWtoNHWC(axes);
+  }
+
   return true;
 }
 
-bool GetResizeOutputSizes(const InitializedTensorSet& initializers,
-                          const Node& node, std::vector<int64_t>& sizes,
-                          const logging::Logger& logger) {
+bool GetResizeSizesAndAxes(const InitializedTensorSet& initializers,
+                           const Node& node, std::vector<int64_t>& sizes,
+                           std::vector<int64_t>& axes, const bool is_nhwc,
+                           const gsl::span<int64_t>& input_shape,
+                           const logging::Logger& logger) {
   const auto& input_defs = node.InputDefs();
   if (input_defs.size() < 4)
     return false;
 
+  const bool has_axes = !axes.empty();
   const auto& sizes_tensor = *initializers.at(input_defs[3]->Name());
-  if (sizes_tensor.dims_size() != 1 || sizes_tensor.dims()[0] != 4)
+  if (sizes_tensor.dims_size() != 1) {
+    LOGS(logger, ERROR) << "'sizes' should be a 1D tensor.";
+    return false;
+  }
+
+  // Number of elements of sizes tensor.
+  const auto num_of_sizes = sizes_tensor.dims()[0];
+  if (has_axes && num_of_sizes != 2) {
+    LOGS(logger, ERROR) << "When 'axes' is provided, 'sizes' should have 2 elements.";
+    return false;
+  }
+
+  if (!has_axes && num_of_sizes != 4) {
+    LOGS(logger, ERROR) << "When 'axes' is not provided, 'sizes' should have 4 elements.";
     return false;
+  }
 
   std::vector<uint8_t> unpacked_tensor;
   auto status = onnxruntime::utils::UnpackInitializerData(sizes_tensor, unpacked_tensor);
@@ -78,7 +140,35 @@ bool GetResizeOutputSizes(const InitializedTensorSet& initializers,
     return false;
   }
   const int64_t* sizes_data = reinterpret_cast<const int64_t*>(unpacked_tensor.data());
-  sizes = std::vector<int64_t>{sizes_data, sizes_data + 4};
+
+  if (has_axes) {
+    // 'axes' is specified since opset 18+, 'sizes' should have 2 elements.
+    sizes = std::vector<int64_t>{sizes_data, sizes_data + 2};
+  } else {
+    // Before opset 18, 'sizes' should have 4 elements.
+    // Make sure 'sizes' is not trying to resize on N/C channels here.
+    std::vector<int64_t> onnx_sizes{sizes_data, sizes_data + 4};
+    auto size_n = onnx_sizes[0];
+    const int c_idx = is_nhwc ? 3 : 1;
+    if (size_n != input_shape[0] || onnx_sizes[c_idx] != input_shape[c_idx]) {  // N and C must stay unchanged.
+      LOGS(logger, VERBOSE) << "Output sizes of N/C channel should match the input sizes, "
+                            << "Resize of N/C channels are not supported"
+                            << ", input_size_n, " << input_shape[0] << ", output_size_n, " << size_n
+                            << ", input_size_c, " << input_shape[c_idx] << ", output_size_c, " << onnx_sizes[c_idx];
+      return false;
+    }
+    // 'sizes' input has been transposed to NHWC layout if it is NHWC preferred layout.
+    const int64_t sizes_h = is_nhwc ? onnx_sizes[1] : onnx_sizes[2];
+    const int64_t sizes_w = is_nhwc ? onnx_sizes[2] : onnx_sizes[3];
+    sizes = {sizes_h, sizes_w};
+    axes = {2, 3};
+  }
+
+  if (is_nhwc) {
+    // For NHWC preferred layout, we need to convert 'axes' from NCHW to NHWC.
+    axes = convertAxesFromNCHWtoNHWC(axes);
+  }
+
   return true;
 }
 
@@ -103,9 +193,15 @@ void ResizeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const N
 Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                               const Node& node,
                                               const logging::Logger& logger) const {
+  const auto& input_defs = node.InputDefs();
+  std::vector<int64_t> input_shape;
+  ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape");
+
+  const auto& initializers(model_builder.GetInitializerTensors());
+  NodeAttrHelper helper(node);
+
   emscripten::val options = emscripten::val::object();
   options.set("label", node.Name());
-  NodeAttrHelper helper(node);
   const auto mode = helper.Get("mode", "nearest");
   if (mode == "linear") {
     options.set("mode", emscripten::val("linear"));
@@ -113,45 +209,30 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
     options.set("mode", emscripten::val("nearest-neighbor"));
   }
 
-  const auto& input_defs = node.InputDefs();
-  const auto& initializers(model_builder.GetInitializerTensors());
-
   std::vector<float> scales;
-  std::vector<int32_t> sizes;
-  std::vector<float> scales_hw;
-  std::vector<int32_t> sizes_hw;
-  std::vector<int32_t> axes;
-  std::string scales_name = GetTensorName(input_defs, 2);
+  std::vector<int64_t> sizes;
+  std::vector<uint32_t> webnn_sizes;
+  std::vector<int64_t> axes = GetResolvedAxes(helper, 4);  // We already checked input shape is 4D in IsOpSupportedImpl.
+  std::string sizes_name = GetTensorName(input_defs, 3);
   const bool is_nhwc = model_builder.GetPreferredLayout() == DataLayout::NHWC;
-  if (!scales_name.empty()) {  // Use scales.
-    ORT_RETURN_IF_NOT(GetResizeScales(initializers, node, scales, logger), "Error getting resize scales");
-    if (is_nhwc) {
-      scales_hw = {scales[1], scales[2]};
-    } else {
-      scales_hw = {scales[2], scales[3]};
-    }
-    options.set("scales", emscripten::val::array(scales_hw));
-  } else {  // Use sizes, we already checked inputs in IsOpSupportedImpl.
-    std::vector<int64_t> output_sizes;
-    ORT_RETURN_IF_NOT(GetResizeOutputSizes(initializers, node, output_sizes, logger),
-                      "Error getting resize output_sizes");
-    std::transform(output_sizes.cbegin(), output_sizes.cend(),
-                   std::back_inserter(sizes),
-                   [](int64_t dim) -> int32_t { return SafeInt<int32_t>(dim); });
-    if (is_nhwc) {
-      sizes_hw = {sizes[1], sizes[2]};
-    } else {
-      sizes_hw = {sizes[2], sizes[3]};
-    }
-    options.set("sizes", emscripten::val::array(sizes_hw));
-  }
 
-  if (is_nhwc) {
-    axes = {1, 2};
+  // We know we have either a 'scales' or 'sizes' input so this is safe.
+  // Check for 'sizes' first, using the same non-empty test as IsOpSupportedImpl so both agree on the input used.
+  // This handles Resize-11 where 'scales' was a required input but 'sizes' were used if provided.
+  const bool using_sizes = !IsEmptyTensor(initializers, sizes_name);
+  if (using_sizes) {
+    ORT_RETURN_IF_NOT(GetResizeSizesAndAxes(initializers, node, sizes, axes, is_nhwc, input_shape, logger),
+                      "Error getting Resize sizes");
+    webnn_sizes = GetVecUint32FromVecInt64(sizes);
+    options.set("sizes", emscripten::val::array(webnn_sizes));
   } else {
-    axes = {2, 3};
+    ORT_RETURN_IF_NOT(GetResizeScalesAndAxes(initializers, node, scales, axes, is_nhwc, logger),
+                      "Error getting Resize scales");
+    options.set("scales", emscripten::val::array(scales));
   }
-  options.set("axes", emscripten::val::array(axes));
+
+  std::vector<uint32_t> webnn_axes = GetVecUint32FromVecInt64(axes);
+  options.set("axes", emscripten::val::array(webnn_axes));
 
   emscripten::val input = model_builder.GetOperand(input_defs[0]->Name());
   emscripten::val output = model_builder.GetBuilder().call<emscripten::val>("resample2d", input, options);
@@ -166,6 +247,7 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers
                                         const WebnnDeviceType /* device_type */,
                                         const logging::Logger& logger) const {
   const auto& input_defs = node.InputDefs();
+  NodeAttrHelper helper(node);
 
   std::vector<int64_t> input_shape;
   if (!GetShape(*input_defs[0], input_shape, logger))
@@ -179,92 +261,81 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers
   }
 
   {  // Check attributes.
-    NodeAttrHelper helper(node);
-    const auto mode = helper.Get("mode", "nearest");
-    bool is_linear_resize = mode == "linear";
-    bool is_nearest_resize = mode == "nearest";
-    // WebNN only supports "linear" and "nearest" modes.
-    if (!is_linear_resize && !is_nearest_resize) {
-      LOGS(logger, VERBOSE) << "Resize does not support input mode: " << mode;
+    // antialias
+    if (helper.Get("antialias", 0) != 0) {
+      LOGS(logger, VERBOSE) << "Resize does not support antialias";
       return false;
     }
 
-    const auto exclude_outside = helper.Get("exclude_outside", 0);
-    if (exclude_outside != 0) {
-      LOGS(logger, VERBOSE) << "Resize does not support exclude_outside for now";
+    // coordinate_transformation_mode
+    // Spec issue for supporting more coordinate transformation modes:
+    // https://github.com/webmachinelearning/webnn/issues/270
+    const std::string coordinate_transformation_mode = helper.Get("coordinate_transformation_mode", "half_pixel");
+    if (coordinate_transformation_mode != "half_pixel") {
+      LOGS(logger, VERBOSE) << "Resize does not support coordinate_transformation_mode: "
+                            << coordinate_transformation_mode;
       return false;
     }
-  }
 
-  {  // scales and sizes (if present) must be initializers.
-    const std::string scales_name = GetTensorName(input_defs, 2);
-    const std::string sizes_name = GetTensorName(input_defs, 3);
-
-    // scales (scales may be empty tensor)
-    bool has_scales = !scales_name.empty();
-    if ((has_scales && !Contains(initializers, scales_name)) || (!has_scales && node.SinceVersion() == 11)) {
-      LOGS(logger, VERBOSE) << "Input scales of Resize must be known";
+    // exclude_outside
+    const auto exclude_outside = helper.Get("exclude_outside", 0);
+    if (exclude_outside != 0) {
+      LOGS(logger, VERBOSE) << "Resize does not support exclude_outside for now";
       return false;
     }
 
-    // sizes (sizes may be empty tensor)
-    bool has_sizes = !sizes_name.empty();
-    if (has_sizes && !Contains(initializers, sizes_name)) {
-      LOGS(logger, VERBOSE) << "Input sizes of Resize must be known";
+    // keep_aspect_ratio_policy
+    const auto keep_aspect_ratio_policy = helper.Get("keep_aspect_ratio_policy", "stretch");
+    if (keep_aspect_ratio_policy != "stretch") {
+      LOGS(logger, VERBOSE) << "Resize does not support keep_aspect_ratio_policy: " << keep_aspect_ratio_policy;
       return false;
     }
 
-    if (has_scales && has_sizes) {
-      LOGS(logger, VERBOSE) << "Only one of 'scales' and 'sizes' can be specified";
+    // mode
+    const auto mode = helper.Get("mode", "nearest");
+    bool is_linear_resize = mode == "linear";
+    bool is_nearest_resize = mode == "nearest";
+    // WebNN only supports "linear" and "nearest" modes.
+    if (!is_linear_resize && !is_nearest_resize) {
+      LOGS(logger, VERBOSE) << "Resize does not support input mode: " << mode;
       return false;
     }
+  }
 
-    const bool is_nhwc = node.Domain() == kMSInternalNHWCDomain;
-    // We want to check if the scales or sizes are not trying to resize on N/C channels here.
-    if (has_scales) {  // We are using scales.
-      std::vector<float> scales;
-      if (!GetResizeScales(initializers, node, scales, logger))
-        return false;
-
-      float scale_n = scales[0];
-      float scale_c = is_nhwc ? scales[3] : scales[1];
-      if (scale_n != 1.0f || scale_c != 1.0f) {
-        LOGS(logger, VERBOSE) << "Scales of N/C channel should be 1"
-                              << "Resize of N/C channels are not supported"
-                              << ", scale_n, " << scale_n << ", scale_c, " << scale_c;
-        return false;
-      }
+  {  // 'scales' and 'sizes' (if present) must be non-empty initializers.
+    const std::string scales_name = GetTensorName(input_defs, 2);
+    const std::string sizes_name = GetTensorName(input_defs, 3);
 
-      // For now we only support upscale, so the scale_h and scale_w should be an integer >= 1.
-      // TODO support ResizeBilinear.
-      float scale_h = is_nhwc ? scales[1] : scales[2];
-      float scale_w = is_nhwc ? scales[2] : scales[3];
+    // Check for 'sizes' first.
+    // This handles Resize-11 where 'scales' was a required input but 'sizes' were used if provided.
+    // 'scales' or 'sizes' may be empty tensor.
+    bool using_sizes = !IsEmptyTensor(initializers, sizes_name);
+    bool using_scales = !using_sizes && !IsEmptyTensor(initializers, scales_name);
 
-      // Onnx spec requires scale to be a positive float, so we are not checking that here.
-      if (roundf(scale_h) != scale_h) {
-        LOGS(logger, VERBOSE) << "Resize: scale_h: " << scale_h << " is not a whole number";
-        return false;
-      }
+    if (!using_scales && !using_sizes) {  // Neither input is a non-empty initializer.
+      LOGS(logger, VERBOSE) << "Resize: one of 'scales' or 'sizes' must be a non-empty constant initializer";
+      return false;
+    }
 
-      if (roundf(scale_w) != scale_w) {
-        LOGS(logger, VERBOSE) << "Resize: scale_w: " << scale_w << " is not a whole number";
+    // 'axes' is from opset 18 on and allows 'scales' or 'sizes' to have entries for the subset of 'axes'.
+    // We fill with default values if necessary so that the processing is consistent across all supported opsets.
+    std::vector<int64_t> axes = GetResolvedAxes(helper, input_size);
+    if (!axes.empty()) {  // We have 'axes' attribute.
+      if (axes.size() != 2 || axes[0] >= input_size || axes[1] >= input_size) {
+        LOGS(logger, VERBOSE) << "Resize: invalid axes attribute";
         return false;
       }
     }
 
-    if (has_sizes) {
-      // We are using sizes.
-      std::vector<int64_t> output_sizes;
-      if (!GetResizeOutputSizes(initializers, node, output_sizes, logger))
+    const bool is_nhwc = node.Domain() == kMSInternalNHWCDomain;
+    if (using_sizes) {  // We are using 'sizes'.
+      std::vector<int64_t> sizes;
+      if (!GetResizeSizesAndAxes(initializers, node, sizes, axes, is_nhwc, input_shape, logger)) {
         return false;
-
-      auto output_size_n = output_sizes[0];
-      const int c_idx = is_nhwc ? 3 : 1;
-      if (output_size_n != input_shape[0] || output_sizes[c_idx] != input_shape[c_idx]) {
-        LOGS(logger, VERBOSE) << "Output sizes of N/C chanel should match the input sizes, "
-                              << "Resize of N/C channels are not supported"
-                              << ", input_size_n, " << input_shape[0] << ", output_size_n, " << output_size_n
-                              << ". input_size_c, " << input_shape[c_idx] << ", output_size_c, " << output_sizes[c_idx];
+      }
+    } else {  // We are using 'scales'.
+      std::vector<float> scales;
+      if (!GetResizeScalesAndAxes(initializers, node, scales, axes, is_nhwc, logger)) {
         return false;
       }
     }