[ET-VK] Implement generic reduction shader + mean, sum, amax, amin (#…

…6473) Pull Request resolved: #6457 ## Context Introduce a generic shader to compute reduction along a single dim, and `keepdim = True`. With the generic shader template, `mean`, `sum`, `amin`, and `amax` can be implemented. ghstack-source-id: 249709743 @exported-using-ghexport Differential Revision: [D64840504](https://our.internmc.facebook.com/intern/diff/D64840504/) --------- Co-authored-by: Stephen Jia <[email protected]>
pytorch · Oct 24, 2024 · 8140a90 · 8140a90
1 parent 5692203
commit 8140a90
Show file tree

Hide file tree

Showing 13 changed files with 418 additions and 424 deletions.
diff --git a/backends/vulkan/TARGETS b/backends/vulkan/TARGETS
@@ -27,7 +27,6 @@ runtime.python_library(
         "//executorch/backends/transforms:fuse_conv_with_clamp",
         "//executorch/backends/transforms:fuse_dequant_linear",
         "//executorch/backends/transforms:fuse_view_copy",
-        "//executorch/backends/transforms:mean_to_sum_div",
         "//executorch/backends/transforms:remove_clone_ops",
         "//executorch/backends/vulkan/_passes:vulkan_passes",
         "//executorch/exir:graph_module",

diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py
@@ -89,6 +89,10 @@ def __contains__(self, op):
     # Reduction
     exir_ops.edge.aten._log_softmax.default,
     exir_ops.edge.aten._softmax.default,
+    exir_ops.edge.aten.mean.dim,
+    exir_ops.edge.aten.sum.dim_IntList,
+    exir_ops.edge.aten.amax.default,
+    exir_ops.edge.aten.amin.default,
     # 2D Pooling
     exir_ops.edge.aten.avg_pool2d.default,
     exir_ops.edge.aten.max_pool2d_with_indices.default,
@@ -101,9 +105,6 @@ def __contains__(self, op):
 ]
 
 NO_DYNAMIC_SHAPE = [
-    # Reduction
-    exir_ops.edge.aten.mean.dim,
-    exir_ops.edge.aten.sum.dim_IntList,
     # Normalization
     exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
     exir_ops.edge.aten.native_layer_norm.default,

diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+
+${define_active_storage_type(STORAGE)}
+
+#extension GL_EXT_control_flow_attributes : require
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)}
+
+${layout_declare_ubo(B, "ivec3", "tin_limits")}
+${layout_declare_ubo(B, "ivec4", "tin_sizes")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int packed_dim = 0;
+layout(constant_id = 4) const int reduce_dim = 0;
+layout(constant_id = 5) const int group_dim = 1;
+
+// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of
+// threads that will co-operate to compute one reduction output. There may be
+// multiple groups computing distinct reduction outputs within one work group.
+#define NWORKERS 4
+
+// Sets an upper limit on the total size of a work group based on how many
+// elements are allocated in the shared memory array below. Each thread in the
+// work group will write into its assigned element in the shared array.
+#define MAX_NTHREADS 16
+
+
+shared vec4 shared_vecs[MAX_NTHREADS];
+
+#include "indexing_utils.h"
+
+int tid_to_smi(const ivec2 tid) {
+  return tid.x + tid.y * NWORKERS;
+}
+
+/*
+ * The functions below compute reduction along a single dimension for a tensor.
+ * The shader template generalize reduction by abstracting the initial value of
+ * the accumulator, the calculation used to update the accumulator with new
+ * values, and a postprocessing calculation that can be used to modify the
+ * accumulator before writing to output.
+ *
+ * This shader also utilize shared memory to have multiple threads help compute
+ * the max and sum reduction operations. A total of NGROUPS x NWORKERS threads
+ * are expected to be launched. Each group works on a unique reduction "row", and
+ * within a group NWORKERS threads co-operate to compute the max and sum of one
+ * "row". Each worker in the group is responsible for computing a partial output
+ * of the "row" and uploading it to shared memory; the overall reduction output
+ * can then be determined by aggregating the partial outputs stored in shared
+ * memory.
+ *
+ * As a caveat, this shader does not currently support cases where `batch` > 1
+ * and the reduce dim happens to also be the batch concatenation dim.  To support
+ * this, there will need to be additional logic to set the starting value of
+ * `scan_pos[reduce_dim]`. Since this is not expected to be a common use-case,
+ * supporting this case is left as an exercise for when it is required.
+ */
+
+// Initializing the accumulator accepts the first value in the reduction row,
+// since some reduction operations (i.e. amax, amin) prefer to initialize with
+// a data point instead of a static value.
+#define INIT_ACCUM(first_val) ${INIT_ACCUM}
+#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM}
+// Useful for operators such as mean which want to perform a final calculation
+// with the accumulator.
+#define POSTPROCESS(accum) ${POSTPROCESS}
+
+/*
+ * Computes reduction where the reduction dim is orthogonal to the packed dim.
+ * This case is simpler because each element of a texel belongs to a separate
+ * reduction "group", meaning we don't have to perform reduction along a texel.
+ */
+void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
+  // shared memory index of this thread
+  const int smi = tid_to_smi(tid);
+
+  scan_pos[reduce_dim] = 0;
+  vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos));
+
+  scan_pos[reduce_dim] = tid.x;
+  // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of
+  // the reduction row
+  for (int i = tid.x; i < tin_sizes[reduce_dim];
+       i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
+    accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
+  }
+  // Write partial output to shared memory and synchronize work group
+  shared_vecs[smi] = accum;
+  barrier();
+
+  // Since the reduction row is reduced to only one element, only the "main"
+  // thread in the group needs aggregate the partial outputs
+  if (tid.x == 0) {
+    // Iterate over the partial outputs to obtain the overall output
+    int group_i = tid.y * NWORKERS;
+    accum = shared_vecs[group_i++];
+    for (int i = 1; i < NWORKERS; i++, group_i++) {
+      accum = UPDATE_ACCUM(accum, shared_vecs[group_i]);
+    }
+
+    // Determine if there are any padding elements in the final texel of the
+    // packed dimension
+    const int nspill = mod4(tin_sizes[packed_dim]);
+    // Detect if this thread is working on the final texels of the packed
+    // dimension, which may have padding elements
+    const bool is_last_texel =
+        scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
+
+    // Explicitly set padding elements to 0
+    if (is_last_texel && nspill > 0) {
+      [[unroll]] for (int i = nspill; i < 4; i++) {
+        accum[i] = 0;
+      }
+    }
+    scan_pos[reduce_dim] = tid.x;
+    write_texel(tout, scan_pos, POSTPROCESS(accum));
+  }
+}
+
+/*
+ * Compute reduction where the reduction dim is also the packed dim. This case is
+ * complex because the reduction needs to occur over the individual texels.
+ * Therefore, in this algorithm each element of the accumulator texels are
+ * themselves partial outputs. Special care has to be taken to ignore padding
+ * elements in texels (which occur when the size of the packed dim is not a
+ * multiple of 4) so that they do not influence the output of reduction.
+ */
+void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
+  // shared memory index of this thread
+  const int smi = tid_to_smi(tid);
+
+  // Number of non-padding elements in the last texel in the reduction row
+  const int nspill = mod4(tin_sizes[packed_dim]);
+  // Only reduce up to the last "complete" texel. The last texel will need to be
+  // handled specially if it has padding elements.
+  const int reduce_len = tin_sizes[packed_dim] - nspill;
+
+  scan_pos[reduce_dim] = 0;
+  vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x));
+
+  // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of
+  // the reduction row
+  scan_pos[reduce_dim] = tid.x;
+  for (int i = tid.x * 4; i < reduce_len;
+       i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) {
+    accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
+  }
+  // For the last texel in the dim, if there are padding elements then each
+  // element of the texel needs to be processed individually such that the
+  // padding elements are ignored
+  if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) {
+    const vec4 intex = load_texel(tin, scan_pos);
+    for (int i = 0; i < nspill; i++) {
+      accum.x = UPDATE_ACCUM(accum.x, intex[i]);
+    }
+  }
+  // Write partial output to shared memory and synchronize work group
+  shared_vecs[smi] = accum;
+  barrier();
+
+  // Since the reduction row is reduced to only one element, only the "main"
+  // thread in the group needs aggregate the partial outputs
+  if (tid.x == 0) {
+    // Iterate over the partial maximums to obtain the overall maximum
+    int group_i = tid.y * NWORKERS;
+    accum = shared_vecs[group_i++];
+    for (int i = 1; i < NWORKERS; i++, group_i++) {
+      accum = UPDATE_ACCUM(accum, shared_vecs[group_i]);
+    }
+    // Each element of the texel is itself a partial maximum; iterate over the
+    // texel to find the actual maximum
+    float accum_final = accum.x;
+    [[unroll]] for (int i = 1; i < 4; i++) {
+      accum_final = UPDATE_ACCUM(accum[i], accum_final);
+    }
+
+    scan_pos[reduce_dim] = tid.x;
+    write_texel(tout, scan_pos, POSTPROCESS(vec4(accum_final, 0, 0, 0)));
+  }
+}
+
+void main() {
+  ivec3 scan_pos = ivec3(gl_GlobalInvocationID);
+  scan_pos[reduce_dim] = 0;
+
+  const ivec2 tid = ivec2(
+      gl_LocalInvocationID[reduce_dim],
+      gl_LocalInvocationID[group_dim]);
+
+  if (any(greaterThanEqual(scan_pos, tin_limits))) {
+    return;
+  }
+
+  if (reduce_dim != packed_dim) {
+    reduce_nonpacked_dim(tid, scan_pos);
+  } else {
+    reduce_packed_dim(tid, scan_pos);
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+reduce:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+    INIT_ACCUM: VEC4_T(0)
+    UPDATE_ACCUM: accum + new_val
+    POSTPROCESS: accum
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: sum
+    - NAME: mean
+      POSTPROCESS: (accum / tin_sizes[reduce_dim])
+    - NAME: amax
+      INIT_ACCUM: first_val
+      UPDATE_ACCUM: max(accum, new_val)
+      POSTPROCESS: accum
+    - NAME: amin
+      INIT_ACCUM: first_val
+      UPDATE_ACCUM: min(accum, new_val)
+      POSTPROCESS: accum
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.glsl
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/sum_dim.yaml