Skip to content

Commit

Permalink
Update on "Reuse GELU implementation from PyTorch core"
Browse files Browse the repository at this point in the history
kernels/optimized doesn't need to support embedded systems, so it can just take a header-only dep on PyTorch.

Note that, because we will pick up Sleef internally and ignore it
externally thanks to ATen vec, this PR gets to enable optimized GELU in OSS.

Testing: CI to make sure this doesn't break mobile build modes; happy to take advice on anything not currently covered that might break.

Differential Revision: [D66335522](https://our.internmc.facebook.com/intern/diff/D66335522/)

[ghstack-poisoned]
  • Loading branch information
swolchok committed Dec 2, 2024
2 parents 9ce0708 + df4aa97 commit ca0fa70
Show file tree
Hide file tree
Showing 87 changed files with 4,622 additions and 497 deletions.
20 changes: 11 additions & 9 deletions .ci/scripts/gather_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@
CUSTOM_RUNNERS = {
"linux": {
# This one runs OOM on smaller runner, the root cause is unclear (T163016365)
"w2l": "linux.12xlarge",
"ic4": "linux.12xlarge",
"resnet50": "linux.12xlarge",
"llava": "linux.12xlarge",
"llama3_2_vision_encoder": "linux.12xlarge",
# "llama3_2_text_decoder": "linux.12xlarge", # TODO: re-enable test when Huy's change is in / model gets smaller.
"w2l": "linux.4xlarge.memory",
"ic4": "linux.4xlarge.memory",
"resnet50": "linux.4xlarge.memory",
"llava": "linux.4xlarge.memory",
"llama3_2_vision_encoder": "linux.4xlarge.memory",
"llama3_2_text_decoder": "linux.4xlarge.memory",
# This one causes timeout on smaller runner, the root cause is unclear (T161064121)
"dl3": "linux.12xlarge",
"emformer_join": "linux.12xlarge",
"emformer_predict": "linux.12xlarge",
"dl3": "linux.4xlarge.memory",
"emformer_join": "linux.4xlarge.memory",
"emformer_predict": "linux.4xlarge.memory",
}
}

Expand All @@ -39,10 +39,12 @@
"linux": {
"mobilebert": 90,
"emformer_predict": 360,
"llama3_2_text_decoder": 360,
},
"macos": {
"mobilebert": 90,
"emformer_predict": 360,
"llama3_2_text_decoder": 360,
},
}

Expand Down
3 changes: 3 additions & 0 deletions .ci/scripts/setup-macos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ install_buck() {

rm "${BUCK2}"
popd

# Kill all running buck2 daemons for a fresh start
buck2 killall || true
}

function write_sccache_stub() {
Expand Down
11 changes: 7 additions & 4 deletions .ci/scripts/test_llama.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ UPLOAD_DIR="${UPLOAD_DIR:-}"
# Default PT2E_QUANTIZE to empty string if not set
PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"

# Default CMake Build Type to release mode
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}

if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
Expand Down Expand Up @@ -143,7 +146,7 @@ cmake_install_executorch_libraries() {
rm -rf cmake-out
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
Expand All @@ -157,22 +160,22 @@ cmake_install_executorch_libraries() {
-DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out .
cmake --build cmake-out -j9 --target install --config Debug
cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
}

cmake_build_llama_runner() {
echo "Building llama runner"
dir="examples/models/llama"
retry cmake \
-DCMAKE_INSTALL_PREFIX=cmake-out \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out/${dir} \
${dir}
cmake --build cmake-out/${dir} -j9 --config Debug
cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE"

}

Expand Down
16 changes: 8 additions & 8 deletions .ci/scripts/test_llava.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
set -exu
# shellcheck source=/dev/null

BUILD_TYPE=${1:-Debug}
TARGET_OS=${2:-Native}
BUILD_DIR=${3:-cmake-out}
CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}

echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR"
echo "Building with CMAKE_BUILD_TYPE: $CMAKE_BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR"

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
PYTHON_EXECUTABLE=python3
Expand All @@ -32,7 +32,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi

EXECUTORCH_COMMON_CMAKE_ARGS=" \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
Expand All @@ -49,7 +49,7 @@ cmake_install_executorch_libraries() {
${EXECUTORCH_COMMON_CMAKE_ARGS} \
-B${BUILD_DIR} .

cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
}

cmake_install_executorch_libraries_for_android() {
Expand All @@ -59,14 +59,14 @@ cmake_install_executorch_libraries_for_android() {
${EXECUTORCH_COMMON_CMAKE_ARGS} \
-B${BUILD_DIR} .

cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
}


LLAVA_COMMON_CMAKE_ARGS=" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
-DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK=ON"
Expand All @@ -81,7 +81,7 @@ cmake_build_llava_runner() {
-B${BUILD_DIR}/${dir} \
${dir}

cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE}
}


Expand All @@ -98,7 +98,7 @@ cmake_build_llava_runner_for_android() {
-B${BUILD_DIR}/${dir} \
${dir}

cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE}
cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE}
}

# only export the one without custom op for now since it's
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/apple.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ jobs:
build-demo-ios:
name: build-demo-ios
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
secrets: inherit
with:
Expand Down Expand Up @@ -190,6 +192,8 @@ jobs:
) done
upload-frameworks-ios:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
runs-on: ubuntu-22.04
needs: [build-frameworks-ios, set-version]
timeout-minutes: 30
Expand Down Expand Up @@ -278,6 +282,8 @@ jobs:
build-benchmark-app:
name: build-benchmark-app
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
secrets: inherit
with:
Expand Down
16 changes: 1 addition & 15 deletions .github/workflows/ghstack_land.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,7 @@ on:
pull_request:
types: [closed]
branches:
- 'gh/cccclai/[0-9]+/base'
- 'gh/dbort/[0-9]+/base'
- 'gh/dvorjackz/[0-9]+/base'
- 'gh/guangy10/[0-9]+/base'
- 'gh/helunwencser/[0-9]+/base'
- 'gh/jorgep31415/[0-9]+/base'
- 'gh/kimishpatel/[0-9]+/base'
- 'gh/kirklandsign/[0-9]+/base'
- 'gh/larryliu0820/[0-9]+/base'
- 'gh/lucylq/[0-9]+/base'
- 'gh/manuelcandales/[0-9]+/base'
- 'gh/mcr229/[0-9]+/base'
- 'gh/swolchok/[0-9]+/base'
- 'gh/SS-JIA/[0-9]+/base'
- 'gh/trivedivivek/[0-9]+/base'
- 'gh/*/[0-9]+/base'

jobs:
ghstack_merge_to_main:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ jobs:
docker-image: executorch-ubuntu-22.04-clang12

unittest-arm:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/trunk.yml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ jobs:
test-arm-backend-delegation:
name: test-arm-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
Expand All @@ -157,7 +157,7 @@ jobs:
test-arm-reference-delegation:
name: test-arm-reference-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
Expand Down Expand Up @@ -290,7 +290,7 @@ jobs:
# ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava

# # run e2e (export, tokenizer and runner)
# PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release
# PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh

test-qnn-model:
name: test-qnn-model
Expand Down Expand Up @@ -351,6 +351,8 @@ jobs:
done
test-huggingface-transformers:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
name: test-huggingface-transformers
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
secrets: inherit
Expand Down
56 changes: 16 additions & 40 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
endif()

if(EXECUTORCH_BUILD_PYBIND)
# Set up RPATH handling for every target created after this point. See
# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
#
# _rpath_portable_origin holds the loader token for "the directory that
# contains the binary being loaded": @loader_path on Apple platforms, $ORIGIN
# everywhere else.
if(APPLE)
  # Build shared libraries with an @rpath-relative install name.
  set(CMAKE_MACOSX_RPATH ON)
  set(_rpath_portable_origin "@loader_path")
else()
  # Quoted so the value is clearly the literal string $ORIGIN; a "$" that is
  # not followed by "{" is never treated as a variable reference.
  set(_rpath_portable_origin "$ORIGIN")
endif()
# Use separate rpaths during the build and install phases: keep the build-tree
# rpath so binaries run straight from the build directory ...
set(CMAKE_SKIP_BUILD_RPATH FALSE)
# ... and don't use the install rpath during the build phase.
set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
# Installed binaries look for their dependencies next to themselves.
set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}")
# Automatically add the directories of linked libraries that are NOT in the
# build directory to the install rpath.
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11)

if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
Expand Down Expand Up @@ -770,46 +786,6 @@ if(EXECUTORCH_BUILD_PYBIND)
target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS})
target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
target_link_libraries(portable_lib PRIVATE ${_dep_libs})
if(APPLE)
# pip wheels will need to be able to find the torch libraries. On Linux, the
# .so has non-absolute dependencies on libs like "libtorch.so" without
# paths; as long as we `import torch` first, those dependencies will work.
# But Apple dylibs do not support non-absolute dependencies, so we need to
# tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries
# for the torch libraries will look like "@rpath/libtorch.dylib", so we can
# add an LC_RPATH entry to look in a directory relative to the installed
# location of our _portable_lib.so file. To see these LC_* values, run
# `otool -l _portable_lib*.so`.
set_target_properties(
portable_lib
PROPERTIES # Assume that this library will be installed in
# `site-packages/executorch/extension/pybindings`, and that
# the torch libs are in `site-packages/torch/lib`.
BUILD_RPATH "@loader_path/../../../torch/lib"
INSTALL_RPATH "@loader_path/../../../torch/lib"
# Assume <executorch> is the root `site-packages/executorch`
# Need to add <executorch>/extension/llm/custom_ops for
# libcustom_ops_aot_lib.dylib
BUILD_RPATH "@loader_path/../../extension/llm/custom_ops"
INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops"
# Need to add <executorch>/kernels/quantized for
# libquantized_ops_aot_lib.dylib
BUILD_RPATH "@loader_path/../../kernels/quantized"
INSTALL_RPATH "@loader_path/../../kernels/quantized"
)
else()
set_target_properties(
portable_lib
PROPERTIES
# Assume <executorch> is the root `site-packages/executorch`
# Need to add <executorch>/extension/llm/custom_ops for
# libcustom_ops_aot_lib
# Need to add <executorch>/kernels/quantized for
# libquantized_ops_aot_lib
BUILD_RPATH
"$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized"
)
endif()

install(TARGETS portable_lib
LIBRARY DESTINATION executorch/extension/pybindings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
};
Expand Down
Loading

0 comments on commit ca0fa70

Please sign in to comment.