Update on "add eval for attention sink"
This PR adds a function to evaluate the model's perplexity when AttentionSink is enabled.

This is mostly copied from https://github.com/mit-han-lab/streaming-llm/blob/main/examples/eval_long_ppl.py, the script the AttentionSink paper uses for its perplexity evaluation.
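
For context, the evaluation boils down to accumulating per-token negative log-likelihoods over a long text while evicting everything except a few "sink" tokens and a recent window from the KV cache, then reporting perplexity as exp of the mean NLL. Below is a minimal sketch of that loop; the model id, window sizes, and the `evict_kv` helper are illustrative assumptions rather than the code in this PR, and it skips the rotary-position re-indexing that a real AttentionSink implementation performs.

```python
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.2-1B"  # assumption: any HF causal LM works here
model = AutoModelForCausalLM.from_pretrained(model_id).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

def evict_kv(past_key_values, sink=4, recent=252):
    """Keep the first `sink` and last `recent` cache entries per layer.

    Assumes the legacy tuple cache layout (batch, heads, seq_len, head_dim).
    A real attention-sink implementation also re-indexes rotary positions
    after eviction; that detail is omitted here.
    """
    if past_key_values[0][0].size(2) <= sink + recent:
        return past_key_values
    return tuple(
        (
            torch.cat([k[:, :, :sink], k[:, :, -recent:]], dim=2),
            torch.cat([v[:, :, :sink], v[:, :, -recent:]], dim=2),
        )
        for k, v in past_key_values
    )

text = "..."  # a long document, e.g. a wikitext article
input_ids = tokenizer(text, return_tensors="pt").input_ids[0]
loss_fn = CrossEntropyLoss()
nlls, past = [], None

with torch.no_grad():
    for i in range(input_ids.size(0) - 1):
        out = model(
            input_ids[i : i + 1].unsqueeze(0), past_key_values=past, use_cache=True
        )
        past = out.past_key_values
        if hasattr(past, "to_legacy_cache"):  # newer transformers return a Cache object
            past = past.to_legacy_cache()
        past = evict_kv(past)
        # Negative log-likelihood of the next token given everything seen so far.
        nlls.append(loss_fn(out.logits[:, -1, :], input_ids[i + 1 : i + 2]))

ppl = torch.exp(torch.stack(nlls).mean())
print(f"Perplexity over {len(nlls)} tokens: {ppl.item():.2f}")
```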

Differential Revision: [D66474732](https://our.internmc.facebook.com/intern/diff/D66474732/)

Perplexity measured for the Llama 3.2 1B and 1B_Instruct models on up to 40k tokens with AttentionSink enabled:

<img width="966" alt="Screenshot 2024-11-25 at 2 46 04 PM" src="https://github.com/user-attachments/assets/ba7118f9-b5d7-4de8-b1fa-7d2ba0646515">


[ghstack-poisoned]
helunwencser committed Nov 27, 2024
2 parents 8c745db + 85a358b commit a3b8d91
Showing 68 changed files with 3,866 additions and 504 deletions.
20 changes: 11 additions & 9 deletions .ci/scripts/gather_test_models.py
@@ -20,16 +20,16 @@
CUSTOM_RUNNERS = {
"linux": {
# This one runs OOM on smaller runner, the root cause is unclear (T163016365)
"w2l": "linux.12xlarge",
"ic4": "linux.12xlarge",
"resnet50": "linux.12xlarge",
"llava": "linux.12xlarge",
"llama3_2_vision_encoder": "linux.12xlarge",
# "llama3_2_text_decoder": "linux.12xlarge", # TODO: re-enable test when Huy's change is in / model gets smaller.
"w2l": "linux.4xlarge.memory",
"ic4": "linux.4xlarge.memory",
"resnet50": "linux.4xlarge.memory",
"llava": "linux.4xlarge.memory",
"llama3_2_vision_encoder": "linux.4xlarge.memory",
"llama3_2_text_decoder": "linux.4xlarge.memory",
# This one causes timeout on smaller runner, the root cause is unclear (T161064121)
"dl3": "linux.12xlarge",
"emformer_join": "linux.12xlarge",
"emformer_predict": "linux.12xlarge",
"dl3": "linux.4xlarge.memory",
"emformer_join": "linux.4xlarge.memory",
"emformer_predict": "linux.4xlarge.memory",
}
}

@@ -39,10 +39,12 @@
"linux": {
"mobilebert": 90,
"emformer_predict": 360,
"llama3_2_text_decoder": 360,
},
"macos": {
"mobilebert": 90,
"emformer_predict": 360,
"llama3_2_text_decoder": 360,
},
}

3 changes: 3 additions & 0 deletions .ci/scripts/setup-macos.sh
@@ -49,6 +49,9 @@ install_buck() {

rm "${BUCK2}"
popd

# Kill all running buck2 daemon for a fresh start
buck2 killall || true
}

function write_sccache_stub() {
11 changes: 11 additions & 0 deletions .ci/scripts/test_llama.sh
@@ -27,6 +27,10 @@ while [[ $# -gt 0 ]]; do
MODE="$2" # portable or xnnpack+custom or xnnpack+custom+qe
shift 2
;;
-pt2e_quantize)
PT2E_QUANTIZE="$2"
shift 2
;;
-upload)
UPLOAD_DIR="$2"
shift 2
@@ -44,6 +48,9 @@ MODE=${MODE:-"xnnpack+custom"}
# Default UPLOAD_DIR to empty string if not set
UPLOAD_DIR="${UPLOAD_DIR:-}"

# Default PT2E_QUANTIZE to empty string if not set
PT2E_QUANTIZE="${PT2E_QUANTIZE:-}"

if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
@@ -234,6 +241,10 @@ if [[ "${COREML}" == "ON" ]]; then
fi
if [[ "${QNN}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape"
echo "PT2E_QUANTIZE is ${PT2E_QUANTIZE}"
if [[ "${PT2E_QUANTIZE}" == "qnn_16a16w" ]]; then
EXPORT_ARGS+=" --tokenizer_path tokenizer.model --pt2e_quantize qnn_16a16w --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --calibration_data Once "
fi
fi
# Add dynamically linked library location
$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS}
6 changes: 6 additions & 0 deletions .github/workflows/apple.yml
@@ -42,6 +42,8 @@ jobs:
build-demo-ios:
name: build-demo-ios
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
secrets: inherit
with:
@@ -190,6 +192,8 @@ jobs:
) done
upload-frameworks-ios:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
runs-on: ubuntu-22.04
needs: [build-frameworks-ios, set-version]
timeout-minutes: 30
@@ -278,6 +282,8 @@ jobs:
build-benchmark-app:
name: build-benchmark-app
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
secrets: inherit
with:
1 change: 1 addition & 0 deletions .github/workflows/build-wheels-linux.yml
@@ -27,6 +27,7 @@ jobs:
test-infra-ref: main
with-cuda: disabled
with-rocm: disabled
python-versions: '["3.10", "3.11", "3.12"]'

build:
needs: generate-matrix
1 change: 1 addition & 0 deletions .github/workflows/build-wheels-m1.yml
@@ -27,6 +27,7 @@ jobs:
test-infra-ref: main
with-cuda: disabled
with-rocm: disabled
python-versions: '["3.10", "3.11", "3.12"]'

build:
needs: generate-matrix
6 changes: 4 additions & 2 deletions .github/workflows/pull.yml
@@ -332,7 +332,7 @@ jobs:
docker-image: executorch-ubuntu-22.04-clang12

unittest-arm:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -368,6 +368,7 @@ jobs:
strategy:
matrix:
dtype: [fp32]
pt2e_quantize: [qnn_16a16w, qnn_8a8w]
mode: [qnn]
fail-fast: false
with:
@@ -384,6 +385,7 @@
DTYPE=${{ matrix.dtype }}
BUILD_TOOL="cmake"
MODE=${{ matrix.mode }}
PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
@@ -393,7 +395,7 @@
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -dtype "${DTYPE}" -mode "${MODE}"
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
test-phi-3-mini-runner-linux:
name: test-phi-3-mini-runner-linux
42 changes: 40 additions & 2 deletions .github/workflows/trunk.yml
@@ -131,7 +131,7 @@ jobs:
test-arm-backend-delegation:
name: test-arm-backend-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -157,7 +157,7 @@
test-arm-reference-delegation:
name: test-arm-reference-delegation
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-arm-sdk
@@ -351,6 +351,8 @@ jobs:
done
test-huggingface-transformers:
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
if: ${{ !github.event.pull_request.head.repo.fork }}
name: test-huggingface-transformers
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
secrets: inherit
@@ -441,3 +443,39 @@ jobs:
cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is"
echo "::endgroup::"
test-llama-runner-qnn-linux:
name: test-llama-runner-qnn-linux
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
strategy:
matrix:
dtype: [fp32]
pt2e_quantize: [qnn_16a16w, qnn_8a8w]
mode: [qnn]
fail-fast: false
with:
runner: linux.2xlarge
docker-image: executorch-ubuntu-22.04-qnn-sdk
submodules: 'true'
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
timeout: 900
script: |
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
BUILD_TOOL="cmake"
DTYPE=${{ matrix.dtype }}
MODE=${{ matrix.mode }}
PT2E_QUANTIZE=${{ matrix.pt2e_quantize }}
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
# Setup executorch
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
# Install requirements for export_llama
PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
# Test llama2
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh -model stories110M -build_tool "${BUILD_TOOL}" -mode "${MODE}" -dtype "${DTYPE}" -pt2e_quantize "${PT2E_QUANTIZE}"
3 changes: 3 additions & 0 deletions .gitmodules
@@ -64,6 +64,9 @@
[submodule "third-party/pybind11"]
path = third-party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3"]
path = backends/cadence/fusion_g3/third-party/nnlib/nnlib-FusionG3
url = https://github.com/foss-xtensa/nnlib-FusionG3/
[submodule "third-party/ao"]
path = third-party/ao
url = https://github.com/pytorch/ao.git
@@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging {
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
}
@@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling {
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]);
XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]);
};
43 changes: 32 additions & 11 deletions backends/arm/_passes/cast_int64_pass.py
@@ -5,8 +5,15 @@

# pyre-unsafe

import logging

import torch
from executorch.backends.arm._passes.arm_pass_utils import is_param_node
from executorch.exir.pass_base import ExportPass, PassResult
from torch._export.utils import is_buffer

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


class CastInt64ToInt32Pass(ExportPass):
@@ -18,17 +25,31 @@ def _to_int32(self, graph_module: torch.fx.GraphModule):
for node in graph_module.graph.nodes:
fake_tensor = node.meta["val"]
if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor):
if node.meta["val"].dtype == torch.int64:
node.meta["val"] = node.meta["val"].to(torch.int32)
buffer_name = (
self.exported_program.graph_signature.inputs_to_buffers[
node.name
]
)
new_tensor = self.exported_program.state_dict[buffer_name].to(
torch.int32
)
self.exported_program.state_dict[buffer_name] = new_tensor
if node.meta["val"].dtype == torch.int64 and is_param_node(
self.exported_program, node
):
if is_buffer(self.exported_program, node):
node.meta["val"] = node.meta["val"].to(torch.int32)
buffer_name = (
self.exported_program.graph_signature.inputs_to_buffers[
node.name
]
)
buffer = self.exported_program.state_dict[node.name]
logger.warning(
f"Casting buffer {node.name} from torch.int64 to torch.int32"
f" defined in {node.meta['stack_trace']}"
)
if torch.min(buffer) < torch.iinfo(torch.int32).min:
raise RuntimeError(
f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}"
)
if torch.max(buffer) > torch.iinfo(torch.int32).max:
raise RuntimeError(
f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}"
)
buffer_int32 = buffer.to(torch.int32)
self.exported_program.state_dict[buffer_name] = buffer_int32

def call(self, graph_module: torch.fx.GraphModule):
self._to_int32(graph_module)
5 changes: 5 additions & 0 deletions backends/arm/_passes/scalars_to_attribute_pass.py
@@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult:
if isinstance(arg, Node):
new_args.append(arg)
continue
if isinstance(arg, int) and not torch.is_floating_point(
get_first_fake_tensor(n)
):
new_args.append(arg)
continue

prefix = "_tensor_constant_"
get_new_attr_name = get_new_attr_name_with_prefix(prefix)
