Update base for Update on "Reuse GELU implementation from PyTorch core"
kernels/optimized doesn't need to support embedded systems, so it can just take a header-only dep on PyTorch.

Note that, because we will pick up Sleef internally and ignore it
externally thanks to ATen vec, this PR gets to enable optimized GELU in OSS.

Testing: CI to make sure this doesn't break mobile build modes; happy to take advice on anything not currently covered that might break.

Differential Revision: [D66335522](https://our.internmc.facebook.com/intern/diff/D66335522/)

[ghstack-poisoned]
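For reference, the GELU that the optimized kernel computes is

$$\operatorname{GELU}(x) = x\,\Phi(x) = \frac{x}{2}\Bigl(1 + \operatorname{erf}\bigl(x/\sqrt{2}\bigr)\Bigr),$$

where $\Phi$ is the standard normal CDF; PyTorch also exposes the tanh approximation $\frac{x}{2}\bigl(1 + \tanh\bigl(\sqrt{2/\pi}\,(x + 0.044715\,x^{3})\bigr)\bigr)$ via `approximate="tanh"`.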
swolchok committed Nov 26, 2024
2 parents 58ab8ff + 2967302 commit cd41576
Showing 57 changed files with 3,453 additions and 134 deletions.
11 changes: 11 additions & 0 deletions .ci/scripts/test_llama.sh
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
shift 2
;;
-pt2e_quantize)
PT2E_QUANTIZE="$2"
shift 2
;;
-upload)
UPLOAD_DIR="$2"
shift 2
@@ -44,6 +48,9 @@ MODE=${MODE:-"xnnpack+custom"}
# Default UPLOAD_DIR to empty string if not set
UPLOAD_DIR="${UPLOAD_DIR:-}"

# Default PT2E_QUANTIZE to empty string if not set
PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"

if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
@@ -234,6 +241,10 @@ if [[ "${COREML}" == "ON" ]]; then
fi
if [[ "${QNN}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
fi
fi
# Add dynamically linked library location
$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
1 change: 1 addition & 0 deletions .github/workflows/build-wheels-linux.yml
@@ -27,6 +27,7 @@ jobs:
test-infra-ref: main
with-cuda: disabled
with-rocm: disabled
python-versions: '["3.10", "3.11", "3.12"]'

build:
needs: generate-matrix
1 change: 1 addition & 0 deletions .github/workflows/build-wheels-m1.yml
@@ -27,6 +27,7 @@ jobs:
test-infra-ref: main
with-cuda: disabled
with-rocm: disabled
python-versions: '["3.10", "3.11", "3.12"]'

build:
needs: generate-matrix
4 changes: 3 additions & 1 deletion .github/workflows/pull.yml
@@ -368,6 +368,7 @@ jobs:
strategy:
matrix:
dtype: [fp32]
pt2e_quantize: [qnn_16a16w, qnn_8a8w]
mode: [qnn]
fail-fast: false
with:
@@ -384,6 +385,7 @@
DTYPE=${{ matrix.dtype }}
BUILD_TOOL="cmake"
MODE=${{ matrix.mode }}
PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
@@ -393,7 +395,7 @@
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
test-phi-3-mini-runner-linux:
name: test-phi-3-mini-runner-linux
36 changes: 36 additions & 0 deletions .github/workflows/trunk.yml
@@ -441,3 +441,39 @@ jobs:
cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
echo "::endgroup::"
test-llama-runner-qnn-linux:
name: test-llama-runner-qnn-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
matrix:
dtype: [fp32]
pt2e_quantize: [qnn_16a16w, qnn_8a8w]
mode: [qnn]
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 900
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
BUILD_TOOL="cmake"
DTYPE=${{ matrix.dtype }}
MODE=${{ matrix.mode }}
PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
# Setup executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
3 changes: 3 additions & 0 deletions .gitmodules
@@ -64,6 +64,9 @@
[submodule "third-party/pybind11"]
path = third-party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
url = https://github.com/foss-xtensa/nnlib-FusionG3/
[submodule "third-party/ao"]
path = third-party/ao
url = https://github.com/pytorch/ao.git
7 changes: 5 additions & 2 deletions CMakeLists.txt
@@ -338,8 +338,6 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch")
)
endif()
set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type)
# We don't need any of C10's CMake macros.
add_definitions(-DC10_USING_CUSTOM_GENERATED_MACROS)

#
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -522,6 +520,7 @@ endif()
target_include_directories(
executorch_core PUBLIC ${_common_include_directories}
)
target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
target_compile_options(executorch_core PUBLIC ${_common_compile_options})
if(MAX_KERNEL_NUM)
target_compile_definitions(
@@ -542,6 +541,7 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE)
target_include_directories(
executorch_core_shared PUBLIC ${_common_include_directories}
)
target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
target_compile_options(
executorch_core_shared PUBLIC ${_common_compile_options}
)
@@ -562,6 +562,7 @@ endif()
add_library(executorch ${_executorch__srcs})
target_link_libraries(executorch PRIVATE executorch_core)
target_include_directories(executorch PUBLIC ${_common_include_directories})
target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
target_compile_options(executorch PUBLIC ${_common_compile_options})
target_link_options_shared_lib(executorch)

@@ -753,6 +754,8 @@ if(EXECUTORCH_BUILD_PYBIND)
target_include_directories(
util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS}
)
target_compile_definitions(util PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)

target_compile_options(util PUBLIC ${_pybind_compile_options})
target_link_libraries(util PRIVATE torch c10 executorch extension_tensor)

2 changes: 1 addition & 1 deletion backends/apple/coreml/scripts/install_requirements.sh
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
mkdir "$COREML_DIR_PATH/third-party"

echo "${green}ExecuTorch: Cloning coremltools."
git clone --depth 1 --branch 8.0 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
git clone --depth 1 --branch 8.1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
cd $COREMLTOOLS_DIR_PATH

STATUS=$?
18 changes: 5 additions & 13 deletions backends/apple/coreml/test/test_coreml_partitioner.py
@@ -71,23 +71,15 @@ def test_vit_skip_conv(self):
)
)

conv_block = ["aten.convolution.default", "executorch_call_delegate"]
safe_softmax_block = [
"getitem",
"getitem",
"getitem",
"getitem",
"aten.any.dim",
"executorch_call_delegate",
]
final_block = ["getitem"]
total = conv_block + 12 * safe_softmax_block + final_block

assert [
node.target.__name__
for node in delegated_program_manager.exported_program().graph.nodes
if node.op == "call_function"
] == total
] == [
"aten.convolution.default",
"executorch_call_delegate",
"getitem",
]

def test_buffer(self):
embedding_dim = 3
33 changes: 30 additions & 3 deletions backends/arm/arm_backend.py
@@ -52,6 +52,7 @@ def __init__(self):
self.permute_nhwc = False
self.quantize_io = False
self.tosa_version = None
self.input_order = None

def ethosu_compile_spec(
self,
@@ -89,7 +90,7 @@ def ethosu_compile_spec(
self.compiler_flags.append(extra_flags)

base_tosa_version = "TOSA-0.80.0+BI"
if "U55" in config:
if "u55" in config:
# Add the Ethos-U55 extension marker
base_tosa_version += "+u55"
self.tosa_version = TosaSpecification.create_from_string(base_tosa_version)
@@ -134,6 +135,14 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
self.quantize_io = quantize_io
return self

def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder":
"""
Reorder the incoming inputs. This may be required when there is more than
one input and when using the U55/U85 CompileSpec.
"""
self.input_order = input_order
return self

def build(self) -> List[CompileSpec]:
"""
Generate a list of compile spec objects from the builder
@@ -163,6 +172,13 @@ def build(self) -> List[CompileSpec]:
CompileSpec("permute_memory_format", "nhwc".encode())
)

if self.input_order:
self.compile_spec.append(
CompileSpec(
"input_order", " ".join(map(str, self.input_order)).encode()
)
)

if self.quantize_io:
self.compile_spec.append(CompileSpec("quantize_io", "True".encode()))

@@ -214,13 +230,16 @@ def preprocess(  # noqa: C901
artifact_path = None
output_format = ""
compile_flags = []
input_order = []
for spec in compile_spec:
if spec.key == "debug_artifact_path":
artifact_path = spec.value.decode()
if spec.key == "output_format":
output_format = spec.value.decode()
if spec.key == "compile_flags":
compile_flags.append(spec.value.decode())
if spec.key == "input_order":
input_order = list(map(int, spec.value.decode().split(",")))

# Check that the output format is set in the compile spec
if not output_format:
@@ -246,19 +265,27 @@
)

node_visitors = get_node_visitors(edge_program, tosa_spec)

input_count = 0
for node in graph_module.graph.nodes:
if node.op == "call_function":
process_call_function(node, tosa_graph, node_visitors, tosa_spec)
elif node.op == "placeholder":
process_placeholder(node, tosa_graph, edge_program, tosa_spec)
if node.name in edge_program.graph_signature.user_inputs:
input_count += 1
elif node.op == "output":
process_output(node, tosa_graph)
else:
# This will only happen if an unpartitioned graph is passed without
# any checking of compatibility.
dbg_fail(node, tosa_graph, artifact_path)

if len(input_order) > 0:
if input_count != len(input_order):
raise RuntimeError(
"The rank of the input order is not equal to amount of input tensors"
)

# TODO: It would be awesome if this dump could somehow be done on top level and not here.
# Problem is that the desc.json has to be created on the tosa_graph object, which we can't
# access from top level.
@@ -275,7 +302,7 @@
# preprocess and some consume TOSA fb directly.
if output_format == "vela":
# Emit vela_bin_stream format
binary = vela_compile(tosa_graph, compile_flags)
binary = vela_compile(tosa_graph, compile_flags, input_order)
elif output_format == "tosa":
# Emit TOSA flatbuffer
binary = bytes(tosa_graph.serialize())
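As a sketch of how the new input-order option is meant to be driven: the `ethosu_compile_spec()` argument below is an illustrative assumption; only `set_input_order()` and the `input_order` compile spec come from this diff.

```python
# Hypothetical usage sketch for the new set_input_order() builder method.
# The ethosu_compile_spec() argument is assumed for illustration only.
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder

compile_spec = (
    ArmCompileSpecBuilder()
    .ethosu_compile_spec("ethos-u55-128")  # assumed target config string
    .set_input_order("1,0")  # pack the second user input's IO block first
    .build()  # emits CompileSpec("input_order", ...) read back in preprocess()
)
```
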
15 changes: 9 additions & 6 deletions backends/arm/arm_vela.py
@@ -17,10 +17,13 @@

# Pack either input or output tensor block, compose the related arrays into
# per-io structs to simplify runtime use.
def vela_bin_pack_io(prefix, data):
ios = struct.pack("<i", len(data[prefix + "_shape"]))
for i in range(len(data[prefix + "_shape"])):
io_shape = data[prefix + "_shape"][i]
def vela_bin_pack_io(prefix, data, shape_order=None):
vela_input_shapes = data[prefix + "_shape"]

order = shape_order if shape_order else range(len(vela_input_shapes))
ios = struct.pack("<i", len(vela_input_shapes))
for i in order:
io_shape = vela_input_shapes[i]
io_elem_size = data[prefix + "_elem_size"][i]
io_offset = data[prefix + "_offset"][i]
io_region = data[prefix + "_region"][i]
@@ -36,7 +39,7 @@ def vela_bin_pack_io(prefix, data):
# Output via Vela to binary stream for ArmBackendEthosU
# WARNING: Do not change this without changing VelaBinStream.cpp as that
# function consumes this format and the two need to align.
def vela_compile(tosa_graph, args: List[str]):
def vela_compile(tosa_graph, args: List[str], shape_order=None):
with tempfile.TemporaryDirectory() as tmpdir:
tosaname = "out.tosa"
flatbuffer = tosa_graph.serialize()
@@ -78,7 +81,7 @@ def vela_compile(tosa_graph, args: List[str]):
bin_blocks["scratch_data"] = b"\x00" * block_length

# Capture inputs and outputs
bin_blocks["inputs"] = vela_bin_pack_io("input", data)
bin_blocks["inputs"] = vela_bin_pack_io("input", data, shape_order)
bin_blocks["outputs"] = vela_bin_pack_io("output", data)

bin_blocks["vela_end_stream"] = b""
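The effect of the new `shape_order` argument can be shown in isolation; this toy sketch (made-up shapes, per-io payload reduced to a single field) mirrors the loop above.

```python
# Simplified model of vela_bin_pack_io's reordering: the io count is packed
# first, then one block per input, emitted in shape_order when it is given.
import struct

def pack_io_order(shapes, shape_order=None):
    order = shape_order if shape_order else range(len(shapes))
    blob = struct.pack("<i", len(shapes))
    for i in order:
        blob += struct.pack("<i", len(shapes[i]))  # stand-in for the real io struct
    return blob

# With shape_order=[1, 0], the 2-D input's block precedes the 4-D one.
print(pack_io_order([[1, 3, 224, 224], [1, 10]], shape_order=[1, 0]).hex())
```
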
7 changes: 6 additions & 1 deletion backends/arm/operator_support/__init__.py
@@ -5,4 +5,9 @@

# pyre-unsafe

from . import mean_dim_support, tosa_supported_operators, var_correction_support # noqa
from . import ( # noqa
mean_dim_support,
right_shift_support,
tosa_supported_operators,
var_correction_support,
)
35 changes: 35 additions & 0 deletions backends/arm/operator_support/right_shift_support.py
@@ -0,0 +1,35 @@
# Copyright 2024 Arm Limited and/or its affiliates.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import logging

import torch.fx as fx
from executorch.backends.arm.operator_support.tosa_supported_operators import (
register_tosa_support_check,
SupportedTOSAOperatorCheck,
)
from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
from executorch.exir.dialects._ops import ops as exir_ops

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


@register_tosa_support_check
class RightShiftSupported(SupportedTOSAOperatorCheck):
targets = [exir_ops.edge.aten.__rshift__.Scalar]

tosa_specs = [
TosaSpecification.create_from_string("TOSA-0.80.0+BI"),
TosaSpecification.create_from_string("TOSA-0.80.0+MI"),
]

def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification):

# TODO MLETORCH-525 Remove warning
if isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset:
logging.warning(f"{node.target} may introduce one-off errors.")
return True
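
For reference, the op this check gates is the scalar right shift on integer tensors, which behaves as a sign-propagating arithmetic shift; a quick sanity check on stock PyTorch (assumed, not taken from this diff):

```python
# aten.__rshift__.Scalar: right-shift an integer tensor by a Python scalar.
import torch

x = torch.tensor([16, -16], dtype=torch.int32)
print(x >> 2)  # tensor([ 4, -4], dtype=torch.int32)
```
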
1 change: 1 addition & 0 deletions backends/arm/operators/__init__.py
@@ -27,6 +27,7 @@
op_reciprocal,
op_relu,
op_repeat,
op_rshift,
op_rsqrt,
op_select,
op_sigmoid,
