Skip to content

Commit

Permalink
bump torchao pin (#1318)
Browse files Browse the repository at this point in the history
* bump torchao pin

* update pin

* update pin

* merge conflict
  • Loading branch information
metascroy authored Oct 23, 2024
1 parent 7d5ba09 commit 76c1cd2
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 47 deletions.
33 changes: 6 additions & 27 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1092,32 +1092,11 @@ jobs:
id: install-torchao-ops
run: |
bash torchchat/utils/scripts/build_torchao_ops.sh
- name: Set git shas
id: setup-hash
run: |
export TORCHCHAT_ROOT=${PWD}
echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
- name: Load or install ET
id: install-et
uses: actions/cache@v4
with:
path: |
./et-build
./torchchat/utils/scripts/install_et.sh
key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
- if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
continue-on-error: true
- name: Install ET
run: |
echo "Installing ExecuTorch"
export TORCHCHAT_ROOT=${PWD}
bash torchchat/utils/scripts/install_et.sh
- name: Install ExecuTorch python
run: |
echo "Install ExecuTorch python"
export TORCHCHAT_ROOT=$PWD
export ET_BUILD_DIR="et-build"
ENABLE_ET_PYBIND="${1:-true}"
source "torchchat/utils/scripts/install_utils.sh"
install_executorch_python_libs $ENABLE_ET_PYBIND
- name: Install runner
run: |
echo "Installing runner"
Expand All @@ -1132,14 +1111,14 @@ jobs:
wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
export PRMT="Once upon a time in a land far away"
echo "Generate eager"
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
echo "Generate compile"
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile
echo "Export and run ET (C++ runner)"
python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
echo "Export and run AOTI (C++ runner)"
python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
python torchchat.py export stories110M --output-dso-path ./model.so --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
echo "Generate AOTI"
python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"
Expand Down
23 changes: 15 additions & 8 deletions docs/quantization.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,22 +121,29 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n
## Experimental TorchAO lowbit kernels

### Use
The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.

#### linear:a8wxdq
The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.

You should expect high performance on ARM CPU if bitwidth is 1, 2, 3, 4, or 5 and groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
You should expect high performance on ARM CPU if groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.

#### embedding:wx
The quantization scheme embedding:wx quantizes embeddings in a groupwise manner with the specified bitwidth and groupsize. It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize. Unlike linear:a8wxdq, embedding:wx always quantizes with scales and zeros.

You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.

### Setup
To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.

From the torchchat root directory, run
```
sh torchchat/utils/scripts/build_torchao_ops.sh
```

This should take about 10 seconds to complete. Once finished, you can use a8wxdq in torchchat.
This should take about 10 seconds to complete.

Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts that build the runners.

Expand All @@ -156,17 +163,17 @@ Below we show how to use the new kernels. Except for ExecuTorch, you can specif

#### Eager mode
```
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --prompt "Once upon a time," --num-samples 5
```

#### torch.compile
```
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile --prompt "Once upon a time," --num-samples 5
```

#### AOTI
```
OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso llama3_1.so
OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5
```

Expand All @@ -178,7 +185,7 @@ OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cac

#### ExecuTorch
```
python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte llama3_1.pte
python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-pte llama3_1.pte
```

Note: only the ExecuTorch C++ runner in torchchat, when built using the instructions in the setup section, can run the exported *.pte file. The exported file will not work with the `python torchchat.py generate` command.
Expand Down
2 changes: 1 addition & 1 deletion install/.pins/torchao-pin.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
49b1fb61c8b8eceda755579a2fd92c756d822de2
c8f1174a06dcc0102849c8348ca6573bde8847a9
38 changes: 29 additions & 9 deletions torchchat/utils/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,15 @@
find_multiple,
get_device_str,
get_precision,
set_precision,
name_to_dtype,
state_dict_device,
use_et_backend,
)


# Exception raised while loading the torchao experimental quantizers, if any (None when loading succeeded).
a8wxdq_load_error: Optional[Exception] = None
torchao_experimental_load_error: Optional[Exception] = None

#########################################################################
### handle arg validation ###
Expand Down Expand Up @@ -115,6 +116,13 @@ def quantize_model(
if not support_tensor_subclass:
unwrap_tensor_subclass(model)
continue

if quantizer in ["linear:a8wxdq", "embedding:wx"]:
# These quantizers require float32 input weights. Note that after quantization,
# the weights will no longer be float32, but lowbit integers
if get_precision() != torch.float32:
print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32.")
set_precision(torch.float32)

# We set global precision from quantize options if it is specified at cli.py:485
# so the precision returned by get_precision() is always the authoritative precision/dtype in torchchat
Expand Down Expand Up @@ -887,24 +895,35 @@ def quantized_model(self) -> nn.Module:

try:
import importlib.util
import sys
import os
import sys

torchao_build_path = f"{os.getcwd()}/torchao-build"

# Try loading quantizer
torchao_experimental_quant_api_spec = importlib.util.spec_from_file_location(
"torchao_experimental_quant_api",
f"{torchao_build_path}/src/ao/torchao/experimental/quant_api.py",
)
torchao_experimental_quant_api = importlib.util.module_from_spec(torchao_experimental_quant_api_spec)
torchao_experimental_quant_api = importlib.util.module_from_spec(
torchao_experimental_quant_api_spec
)
sys.modules["torchao_experimental_quant_api"] = torchao_experimental_quant_api
torchao_experimental_quant_api_spec.loader.exec_module(torchao_experimental_quant_api)
from torchao_experimental_quant_api import Int8DynActIntxWeightQuantizer
quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightQuantizer
torchao_experimental_quant_api_spec.loader.exec_module(
torchao_experimental_quant_api
)
from torchao_experimental_quant_api import (
Int8DynActIntxWeightLinearQuantizer,
IntxWeightEmbeddingQuantizer,
)

quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer
quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer

# Try loading custom op
try:
import glob

libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*")
libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
torch.ops.load_library(libs[0])
Expand All @@ -915,8 +934,9 @@ def quantized_model(self) -> nn.Module:
except Exception as e:
class ErrorHandler(QuantHandler):
def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None):
global a8wxdq_load_error
raise Exception(f"Note: Failed to load torchao experimental a8wxdq quantizer with error: {a8wxdq_load_error}")
global torchao_experimental_load_error
raise Exception(f"Note: Failed to load torchao experimental quantizer with error: {torchao_experimental_load_error}")

a8wxdq_load_error = e
torchao_experimental_load_error = e
quantizer_class_dict["linear:a8wxdq"] = ErrorHandler
quantizer_class_dict["embedding:wx"] = ErrorHandler
3 changes: 1 addition & 2 deletions torchchat/utils/scripts/install_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,6 @@ install_torchao_aten_ops() {
cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
-DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
-DCMAKE_BUILD_TYPE="Release" \
-DTORCHAO_OP_TARGET="aten" \
-S . \
-B ${CMAKE_OUT_DIR} -G Ninja
cmake --build ${CMAKE_OUT_DIR} --target install --config Release
Expand All @@ -207,7 +206,7 @@ install_torchao_executorch_ops() {
cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
-DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
-DCMAKE_BUILD_TYPE="Release" \
-DTORCHAO_OP_TARGET="executorch" \
-DTORCHAO_BUILD_EXECUTORCH_OPS=ON \
-DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
-DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
-S . \
Expand Down

0 comments on commit 76c1cd2

Please sign in to comment.