Update on "[ET-VK] Replacing use of adaptive_work_group_size function by create_local_wg_size function."

This diff replaces the use of the adaptive_work_group_size function with the create_local_wg_size function, which is better tuned for shader performance.

Differential Revision: [D66308779](https://our.internmc.facebook.com/intern/diff/D66308779/)

[ghstack-poisoned]
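For context on the shader tuning mentioned above: a helper that derives the local workgroup size typically maps a shader's global work size onto per-dispatch local dimensions while keeping the total invocation count under a device limit. The Python sketch below is purely illustrative — the helper name, the 64-invocation budget, and the doubling heuristic are assumptions for this note, not the actual ET-VK create_local_wg_size logic.

```python
# Illustrative sketch only: NOT the ET-VK implementation of create_local_wg_size.
# It shows one common heuristic: grow each local dimension (x fastest) while the
# total invocation count stays within an assumed per-dispatch budget.
from typing import List, Tuple

MAX_LOCAL_INVOCATIONS = 64  # assumed budget, not a queried device limit


def local_wg_size_sketch(global_wg_size: Tuple[int, int, int]) -> List[int]:
    """Pick a local workgroup size whose product stays within the budget."""
    local_wg = [1, 1, 1]
    total = 1
    for dim in range(3):  # x, then y, then z
        while (
            local_wg[dim] * 2 <= global_wg_size[dim]
            and total * 2 <= MAX_LOCAL_INVOCATIONS
        ):
            local_wg[dim] *= 2
            total *= 2
    return local_wg


if __name__ == "__main__":
    print(local_wg_size_sketch((128, 64, 4)))   # -> [64, 1, 1]
    print(local_wg_size_sketch((1, 1, 2048)))   # -> [1, 1, 64]
    print(local_wg_size_sketch((6, 6, 6)))      # -> [4, 4, 4]
```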
trivedivivek committed Dec 2, 2024
2 parents 795e983 + 499341b commit b3b686e
Showing 126 changed files with 5,638 additions and 934 deletions.
20 changes: 11 additions & 9 deletions .ci/scripts/gather_test_models.py
@@ -20,16 +20,16 @@
CUSTOM_RUNNERS = {
"linux": {
# This one runs OOM on smaller runner, the root cause is unclear (T163016365)
"w2l": "linux.12xlarge",
"ic4": "linux.12xlarge",
"resnet50": "linux.12xlarge",
"llava": "linux.12xlarge",
"llama3_2_vision_encoder": "linux.12xlarge",
# "llama3_2_text_decoder": "linux.12xlarge", # TODO: re-enable test when Huy's change is in / model gets smaller.
"w2l": "linux.4xlarge.memory",
"ic4": "linux.4xlarge.memory",
"resnet50": "linux.4xlarge.memory",
"llava": "linux.4xlarge.memory",
"llama3_2_vision_encoder": "linux.4xlarge.memory",
"llama3_2_text_decoder": "linux.4xlarge.memory",
# This one causes timeout on smaller runner, the root cause is unclear (T161064121)
"dl3": "linux.12xlarge",
"emformer_join": "linux.12xlarge",
"emformer_predict": "linux.12xlarge",
"dl3": "linux.4xlarge.memory",
"emformer_join": "linux.4xlarge.memory",
"emformer_predict": "linux.4xlarge.memory",
}
}

@@ -39,10 +39,12 @@
"linux": {
"mobilebert": 90,
"emformer_predict": 360,
"llama3_2_text_decoder": 360,
},
"macos": {
"mobilebert": 90,
"emformer_predict": 360,
"llama3_2_text_decoder": 360,
},
}

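The hunks above move several models onto memory-optimized runners and add a per-model timeout for llama3_2_text_decoder. As a rough illustration of how such per-model overrides are typically consumed, the sketch below resolves a runner and timeout with fallbacks; the helper name and the default values are assumptions for this example, not code from gather_test_models.py.

```python
# Illustrative sketch of resolving per-model CI overrides with defaults.
# The defaults ("linux.2xlarge", 90 minutes) are assumed for this example.
CUSTOM_RUNNERS = {
    "linux": {
        "w2l": "linux.4xlarge.memory",
        "llama3_2_text_decoder": "linux.4xlarge.memory",
    }
}

CUSTOM_TIMEOUT = {
    "linux": {
        "emformer_predict": 360,
        "llama3_2_text_decoder": 360,
    }
}


def resolve_job_config(os_name: str, model: str) -> dict:
    """Return the runner and timeout for a model, falling back to defaults."""
    runner = CUSTOM_RUNNERS.get(os_name, {}).get(model, "linux.2xlarge")
    timeout = CUSTOM_TIMEOUT.get(os_name, {}).get(model, 90)
    return {"model": model, "runner": runner, "timeout": timeout}


if __name__ == "__main__":
    print(resolve_job_config("linux", "llama3_2_text_decoder"))
    # {'model': 'llama3_2_text_decoder', 'runner': 'linux.4xlarge.memory', 'timeout': 360}
    print(resolve_job_config("linux", "mv3"))
    # {'model': 'mv3', 'runner': 'linux.2xlarge', 'timeout': 90}
```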
3 changes: 3 additions & 0 deletions .ci/scripts/setup-macos.sh
@@ -49,6 +49,9 @@ install_buck() {

rm "${BUCK2}"
popd

# Kill all running buck2 daemon for a fresh start
buck2 killall || true
}

function write_sccache_stub() {
11 changes: 7 additions & 4 deletions .ci/scripts/test_llama.sh
@@ -51,6 +51,9 @@ UPLOAD_DIR="${UPLOAD_DIR:-}"
# Default PT2E_QUANTIZE to empty string if not set
PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"

# Default CMake Build Type to release mode
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}

if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
@@ -143,7 +146,7 @@ cmake_install_executorch_libraries() {
rm -rf cmake-out
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -157,22 +160,22 @@ cmake_install_executorch_libraries() {
-DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out .
cmake --build cmake-out -j9 --target install --config Debug
cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
}

cmake_build_llama_runner() {
echo "Building llama runner"
dir="examples/models/llama"
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out/${dir} \
${dir}
cmake --build cmake-out/${dir} -j9 --config Debug
cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE"

}

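The change above stops hard-coding Debug and instead honors a CMAKE_BUILD_TYPE environment variable that defaults to Release for both the configure and build steps. A minimal sketch of that pattern, assuming a trimmed-down flag list rather than the full test_llama.sh arguments:

```python
# Illustrative sketch of the pattern used by the updated test scripts:
# read CMAKE_BUILD_TYPE from the environment, default to Release, and pass
# it to both the configure and build/install steps.
import os
import subprocess


def build_executorch(install_prefix: str = "cmake-out") -> None:
    build_type = os.environ.get("CMAKE_BUILD_TYPE", "Release")

    # Configure step: the build type is no longer hard-coded to Debug.
    subprocess.run(
        [
            "cmake",
            f"-DCMAKE_INSTALL_PREFIX={install_prefix}",
            f"-DCMAKE_BUILD_TYPE={build_type}",
            f"-B{install_prefix}",
            ".",
        ],
        check=True,
    )

    # Build and install with the same build type (matters for multi-config generators).
    subprocess.run(
        [
            "cmake", "--build", install_prefix,
            "-j9", "--target", "install", "--config", build_type,
        ],
        check=True,
    )


if __name__ == "__main__":
    build_executorch()
```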
16 changes: 8 additions & 8 deletions .ci/scripts/test_llava.sh
@@ -8,11 +8,11 @@
set -exu
# shellcheck source=/dev/null

BUILD_TYPE=${1:-Debug}
TARGET_OS=${2:-Native}
BUILD_DIR=${3:-cmake-out}
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}

echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR"
echo "Building with CMAKE_BUILD_TYPE: $CMAKE_BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR"

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
PYTHON_EXECUTABLE=python3
@@ -32,7 +32,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi

EXECUTORCH_COMMON_CMAKE_ARGS=" \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -49,7 +49,7 @@ cmake_install_executorch_libraries() {
${EXECUTORCH_COMMON_CMAKE_ARGS} \
-B${BUILD_DIR} .

cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
}

cmake_install_executorch_libraries_for_android() {
@@ -59,14 +59,14 @@ cmake_install_executorch_libraries_for_android() {
${EXECUTORCH_COMMON_CMAKE_ARGS} \
-B${BUILD_DIR} .

cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
}


LLAVA_COMMON_CMAKE_ARGS=" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON"
@@ -81,7 +81,7 @@ cmake_build_llava_runner() {
-B${BUILD_DIR}/${dir} \
${dir}

cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE}
}


@@ -98,7 +98,7 @@ cmake_build_llava_runner_for_android() {
-B${BUILD_DIR}/${dir} \
${dir}

cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE}
}

# only export the one without custom op for now since it's
6 changes: 6 additions & 0 deletions .github/workflows/apple.yml
@@ -42,6 +42,8 @@ jobs:
build-demo-ios:
name: build-demo-ios
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
secrets: inherit
with:
@@ -190,6 +192,8 @@ jobs:
) done
upload-frameworks-ios:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
runs-on: ubuntu-22.04
needs: [build-frameworks-ios, set-version]
timeout-minutes: 30
@@ -278,6 +282,8 @@ jobs:
build-benchmark-app:
name: build-benchmark-app
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
secrets: inherit
with:
16 changes: 1 addition & 15 deletions .github/workflows/ghstack_land.yml
@@ -3,21 +3,7 @@ on:
pull_request:
types: [closed]
branches:
- 'gh/cccclai/[0-9]+/base'
- 'gh/dbort/[0-9]+/base'
- 'gh/dvorjackz/[0-9]+/base'
- 'gh/guangy10/[0-9]+/base'
- 'gh/helunwencser/[0-9]+/base'
- 'gh/jorgep31415/[0-9]+/base'
- 'gh/kimishpatel/[0-9]+/base'
- 'gh/kirklandsign/[0-9]+/base'
- 'gh/larryliu0820/[0-9]+/base'
- 'gh/lucylq/[0-9]+/base'
- 'gh/manuelcandales/[0-9]+/base'
- 'gh/mcr229/[0-9]+/base'
- 'gh/swolchok/[0-9]+/base'
- 'gh/SS-JIA/[0-9]+/base'
- 'gh/trivedivivek/[0-9]+/base'
- 'gh/*/[0-9]+/base'

jobs:
ghstack_merge_to_main:
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
@@ -332,7 +332,7 @@ jobs:
docker-image: executorch-ubuntu-22.04-clang12

unittest-arm:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
8 changes: 5 additions & 3 deletions .github/workflows/trunk.yml
@@ -131,7 +131,7 @@ jobs:
test-arm-backend-delegation:
name: test-arm-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -157,7 +157,7 @@
test-arm-reference-delegation:
name: test-arm-reference-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -290,7 +290,7 @@ jobs:
# ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava

# # run e2e (export, tokenizer and runner)
# PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release
# PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh

test-qnn-model:
name: test-qnn-model
@@ -351,6 +351,8 @@ jobs:
done
test-huggingface-transformers:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
name: test-huggingface-transformers
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
secrets: inherit
56 changes: 16 additions & 40 deletions CMakeLists.txt
@@ -682,6 +682,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
endif()

if(EXECUTORCH_BUILD_PYBIND)
# Setup RPATH.
# See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
if(APPLE)
set(CMAKE_MACOSX_RPATH ON)
set(_rpath_portable_origin "@loader_path")
else()
set(_rpath_portable_origin $ORIGIN)
endif(APPLE)
# Use separate rpaths during build and install phases
set(CMAKE_SKIP_BUILD_RPATH FALSE)
# Don't use the install-rpath during the build phase
set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}")
# Automatically add all linked folders that are NOT in the build directory to
# the rpath (per library?)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)

if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
@@ -765,46 +781,6 @@ if(EXECUTORCH_BUILD_PYBIND)
target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS})
target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
target_link_libraries(portable_lib PRIVATE ${_dep_libs})
if(APPLE)
# pip wheels will need to be able to find the torch libraries. On Linux, the
# .so has non-absolute dependencies on libs like "libtorch.so" without
# paths; as long as we `import torch` first, those dependencies will work.
# But Apple dylibs do not support non-absolute dependencies, so we need to
# tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries
# for the torch libraries will look like "@rpath/libtorch.dylib", so we can
# add an LC_RPATH entry to look in a directory relative to the installed
# location of our _portable_lib.so file. To see these LC_* values, run
# `otool -l _portable_lib*.so`.
set_target_properties(
portable_lib
PROPERTIES # Assume that this library will be installed in
# `site-packages/executorch/extension/pybindings`, and that
# the torch libs are in `site-packages/torch/lib`.
BUILD_RPATH "@loader_path/../../../torch/lib"
INSTALL_RPATH "@loader_path/../../../torch/lib"
# Assume <executorch> is the root `site-packages/executorch`
# Need to add <executorch>/extension/llm/custom_ops for
# libcustom_ops_aot_lib.dylib
BUILD_RPATH "@loader_path/../../extension/llm/custom_ops"
INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops"
# Need to add <executorch>/kernels/quantized for
# libquantized_ops_aot_lib.dylib
BUILD_RPATH "@loader_path/../../kernels/quantized"
INSTALL_RPATH "@loader_path/../../kernels/quantized"
)
else()
set_target_properties(
portable_lib
PROPERTIES
# Assume <executorch> is the root `site-packages/executorch`
# Need to add <executorch>/extension/llm/custom_ops for
# libcustom_ops_aot_lib
# Need to add <executorch>/kernels/quantized for
# libquantized_ops_aot_lib
BUILD_RPATH
"$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
)
endif()

install(TARGETS portable_lib
LIBRARY DESTINATION executorch/extension/pybindings
@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
}
@@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
};
11 changes: 11 additions & 0 deletions backends/arm/TARGETS
@@ -110,3 +110,14 @@ python_library(
"//executorch/backends/arm/operators:node_visitor",
],
)

python_library(
name = "arm_model_evaluator",
src = [
"util/arm_model_evaluator.py",
],
typing = True,
deps = [
"//caffe2:torch",
]
)
6 changes: 3 additions & 3 deletions backends/arm/_passes/arm_pass_manager.py
@@ -29,8 +29,8 @@
DecomposeSoftmaxesPass,
)
from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
InsertSqueezeAfterSumPass,
from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
KeepDimsFalseToSqueezePass,
)
from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
@@ -71,7 +71,7 @@ def transform_to_backend_pipeline(
self.add_pass(DecomposeMeanDimPass())
self.add_pass(MatchArgRanksPass(exported_program))
self.add_pass(DecomposeDivPass())
self.add_pass(InsertSqueezeAfterSumPass())
self.add_pass(KeepDimsFalseToSqueezePass())
self.add_pass(ConvertSplitToSlicePass())
self.add_pass(Conv1dUnsqueezePass(exported_program))
self.add_pass(DecomposeSoftmaxesPass())
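The rename above swaps InsertSqueezeAfterSumPass for the more generally named KeepDimsFalseToSqueezePass in the Arm backend pipeline. As a rough sketch of the pass-pipeline idea — a simplified manager and a toy module representation assumed for this example, whereas the real passes in arm_pass_manager.py operate on an ExportedProgram graph:

```python
# Toy pass-manager sketch: passes are callables applied to a module in order,
# mirroring the add_pass(...) calls in transform_to_backend_pipeline.
from typing import Callable, List


class PassManager:
    def __init__(self) -> None:
        self._passes: List[Callable[[dict], dict]] = []

    def add_pass(self, p: Callable[[dict], dict]) -> None:
        self._passes.append(p)

    def run(self, module: dict) -> dict:
        for p in self._passes:
            module = p(module)
        return module


def keep_dims_false_to_squeeze_pass(module: dict) -> dict:
    # Toy stand-in: mark reduction ops that use keep_dims=False so a later
    # lowering step could insert an explicit squeeze.
    for node in module.get("nodes", []):
        if node.get("op") in ("sum", "mean") and not node.get("keep_dims", True):
            node["needs_squeeze"] = True
    return module


pm = PassManager()
pm.add_pass(keep_dims_false_to_squeeze_pass)
print(pm.run({"nodes": [{"op": "sum", "keep_dims": False}]}))
# {'nodes': [{'op': 'sum', 'keep_dims': False, 'needs_squeeze': True}]}
```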