Fix et runner (#388) #545
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow "pull": triggered by pull requests, by pushes to the main branch,
# and by manual dispatch.
name: pull

on:
  pull_request:
  push:
    branches: [main]
  workflow_dispatch:

# Every job below is an entry of this mapping.
jobs:
# ---- job: gather-models-cpu (entry of the `jobs:` mapping) ----
  # Runs .ci/scripts/gather_test_models.py for the "cpu" backend and re-exports
  # the `models` output of that step as a job output, which the matrix jobs in
  # this workflow expand with fromJSON().
  gather-models-cpu:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models-cpu.outputs.models }}
    steps:
      # Submodules are not needed to run the gather script.
      - uses: actions/checkout@v3
        with:
          submodules: "false"
      - uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      # NOTE(review): the script is presumably what sets the `models` step
      # output - confirm in gather_test_models.py.
      - id: gather-models-cpu
        name: Extract the list of models to run on CPU
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "cpu"
# ---- job: test-cpu-compile (entry of the `jobs:` mapping) ----
  # Matrix job fed by gather-models-cpu. For each matrix entry it installs the
  # nightly CPU torch wheel, downloads and converts the checkpoint, then runs
  # validate.sh with device "cpu" and mode "compile".
  test-cpu-compile:
    name: test-cpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    runs-on: ${{ matrix.runner }}
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
    env:
      REPO_NAME: ${{ matrix.repo_name }}
      TORCHCHAT_ROOT: ${{ github.workspace }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      - name: Download checkpoints
        run: |
          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
      - name: Run validation
        run: |
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
          pushd ${TORCHCHAT_ROOT}
          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "compile"
# ---- job: test-cpu-aoti (entry of the `jobs:` mapping) ----
  # Matrix job fed by gather-models-cpu. Same setup as the CPU compile job, but
  # validate.sh is invoked with mode "aoti".
  test-cpu-aoti:
    name: test-cpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    runs-on: ${{ matrix.runner }}
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
    env:
      REPO_NAME: ${{ matrix.repo_name }}
      TORCHCHAT_ROOT: ${{ github.workspace }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      - name: Download checkpoints
        run: |
          bash ${TORCHCHAT_ROOT}/.ci/scripts/wget_checkpoint.sh ${{ matrix.repo_name }} "${{ matrix.resources }}"
      - name: Run validation
        run: |
          pushd ${TORCHCHAT_ROOT}
          bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cpu" "aoti"
# ---- job: gather-models-gpu (entry of the `jobs:` mapping) ----
  # Runs .ci/scripts/gather_test_models.py for the "gpu" backend and re-exports
  # the `models` output of that step as a job output, which the GPU matrix jobs
  # in this workflow expand with fromJSON().
  gather-models-gpu:
    runs-on: ubuntu-22.04
    outputs:
      models: ${{ steps.gather-models-gpu.outputs.models }}
    steps:
      # Submodules are not needed to run the gather script.
      - uses: actions/checkout@v3
        with:
          submodules: "false"
      - uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      # NOTE(review): the script is presumably what sets the `models` step
      # output - confirm in gather_test_models.py.
      - id: gather-models-gpu
        name: Extract the list of models to run on GPU
        run: |
          set -eux
          PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu"
# ---- job: test-gpu-compile (entry of the `jobs:` mapping) ----
  # Matrix job fed by gather-models-gpu. Delegates to the reusable linux_job
  # workflow from pytorch/test-infra; the `script` input installs the nightly
  # cu121 torch wheel, downloads and converts the checkpoint, then runs
  # validate.sh with device "cuda" and mode "compile".
  test-gpu-compile:
    name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }})
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: gather-models-gpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      # Quoted so the version stays a string rather than a YAML float.
      gpu-arch-version: "12.1"
      # The resources argument is quoted, as in the CPU jobs, so that the
      # download script always receives it as a single argument.
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME=${{ matrix.repo_name }}
        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "compile"
        echo "::endgroup::"
# ---- job: test-gpu-aoti (entry of the `jobs:` mapping) ----
  # Matrix job fed by gather-models-gpu. Delegates to the reusable linux_job
  # workflow from pytorch/test-infra; the `script` input installs the nightly
  # cu121 torch wheel, downloads and converts the checkpoint, then runs
  # validate.sh with device "cuda" and mode "aoti".
  test-gpu-aoti:
    name: test-gpu-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: gather-models-gpu
    strategy:
      matrix: ${{ fromJSON(needs.gather-models-gpu.outputs.models) }}
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
    with:
      runner: linux.g5.4xlarge.nvidia.gpu
      gpu-arch-type: cuda
      # Quoted so the version stays a string rather than a YAML float.
      gpu-arch-version: "12.1"
      # The resources argument is quoted, as in the CPU jobs, so that the
      # download script always receives it as a single argument.
      script: |
        echo "::group::Print machine info"
        nvidia-smi
        echo "::endgroup::"

        echo "::group::Install required packages"
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
        pip install -r ./requirements.txt
        pip list
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoint"
        export REPO_NAME=${{ matrix.repo_name }}
        bash .ci/scripts/wget_checkpoint.sh ${REPO_NAME} "${{ matrix.resources }}"
        echo "::endgroup::"

        echo "::group::Convert checkpoint"
        bash .ci/scripts/convert_checkpoint.sh ${REPO_NAME}
        echo "::endgroup::"

        echo "::group::Run inference"
        bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti"
        echo "::endgroup::"
# ---- job: test-tinystories-executorch (entry of the `jobs:` mapping) ----
  # Clones and installs ExecuTorch, downloads the stories15M checkpoint and a
  # TinyLlama GGUF file, then exercises export.py / generate.py with .pte
  # output for several quantization settings, followed by a GGUF export run.
  test-tinystories-executorch:
    strategy:
      matrix:
        runner: [32-core-ubuntu]
    runs-on: ${{matrix.runner}}
    steps:
      # Action versions and the quoted Python version match the CPU matrix
      # jobs of this workflow.
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          echo "Intalling pip packages"
          pip install wheel
          pip install cmake
          pip install ninja
          pip install zstd
          pip install -r requirements.txt

          echo "Executorch: cloning"
          mkdir etorch
          cd etorch
          git clone https://github.com/pytorch/executorch.git
          cd executorch
          echo "Inside: ${PWD}"

          echo "Executorch: submodule update"
          git submodule sync
          git submodule update --init

          echo "Executorch: installing python interface"
          ./install_requirements.sh --pybind xnnpack
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
          python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
          python3 -c 'import torchaudio;print(f"torchaudio: {torchaudio.__version__, torchaudio.version.git_version}")'

          cd ../..
          echo "Inside: ${PWD}"
      - name: Download checkpoints
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
          popd

          mkdir gguf_files
          export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
          export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
          wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
          wget -O ${GGUF_TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
      # Step names are unique so the three inference steps can be told apart
      # in the job log.
      - name: Run inference (eager and ExecuTorch)
        run: |
          export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M

          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ${PWD}/output_eager
          cat ${PWD}/output_eager

          python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte > ${PWD}/output_et
          cat ${PWD}/output_et

          echo "Tests complete."
      - name: Run inference (ExecuTorch quantization modes)
        run: |
          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M
          export MODEL_DIR=/tmp

          python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
          cat ./output_et

          echo "******************************************"
          echo "******* Emb: channel-wise quantized ******"
          echo "******************************************"
          python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
          cat ./output_et

          echo "******************************************"
          echo "******** Emb: group-wise quantized *******"
          echo "******************************************"
          python export.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
          cat ./output_et

          echo "******************************************"
          echo "******* INT8 channel-wise quantized ******"
          echo "******************************************"
          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
          cat ./output_et

          echo "******************************************"
          echo "******** INT8 group-wise quantized *******"
          echo "******************************************"
          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
          cat ./output_et

          echo "******************************************"
          echo "******** ET: a8w4dq INT4 group-wise quantized *******"
          echo "******************************************"
          python export.py --quant '{"linear:a8w4dq" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte > ./output_et
          # cat ./output_et

          echo "tests complete"
          echo "******************************************"
      # Variables exported in one step do not carry over to the next, so this
      # step defines its own MODEL_NAME; without it the .pte path would
      # collapse to "${PWD}/.pte".
      - name: Run GGUF export + inference
        run: |
          export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
          export GGUF_TOKENIZER_PATH=gguf_files/tokenizer.model
          export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0

          python torchchat.py export --gguf-path ${GGUF_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
          python torchchat.py generate --gguf-path ${GGUF_PATH} --pte-path ${PWD}/${MODEL_NAME}.pte --tokenizer-path ${GGUF_TOKENIZER_PATH} --temperature 0 --max-new-tokens 20 > ${PWD}/output_et
          cat ${PWD}/output_et

          echo "Tests complete."
# ---- job: torchchat-command-load-test (entry of the `jobs:` mapping) ----
  # On macos-14: runs generation through both generate.py and the
  # `torchchat.py generate` subcommand on a locally downloaded stories15M
  # checkpoint, then runs `torchchat.py generate stories15M` by model name.
  torchchat-command-load-test:
    strategy:
      matrix:
        runner: [macos-14]
    runs-on: ${{matrix.runner}}
    steps:
      # Action versions and the quoted Python version match the CPU matrix
      # jobs of this workflow.
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          echo "Installing pip packages"
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      - name: Download Stories files
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
      - name: Test generate
        run: |
          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M
          export MODEL_DIR=/tmp

          python generate.py --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager1
          python torchchat.py generate --device cpu --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager2
          cat ./output_eager1
          cat ./output_eager2

          echo "Tests complete."
      # NOTE(review): the step is named "download" but invokes `generate` with
      # a model name; presumably that triggers a download - confirm.
      - name: Test download
        run: |
          python torchchat.py generate stories15M
# ---- job: test-tinystories-eager (entry of the `jobs:` mapping) ----
  # On macos-12: runs generate.py on the stories15M checkpoint for bfloat16,
  # float16 and float32, each with unquantized, embedding-quantized and
  # int8-linear-quantized settings. The INT4 run is commented out.
  test-tinystories-eager:
    strategy:
      matrix:
        runner: [macos-12]
    runs-on: ${{matrix.runner}}
    steps:
      # Action versions and the quoted Python version match the CPU matrix
      # jobs of this workflow.
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      - name: Download checkpoints
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
      - name: Run inference
        run: |
          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
          export MODEL_NAME=stories15M
          export MODEL_DIR=/tmp

          for DTYPE in bfloat16 float16 float32; do
            # if [ $(uname -s) == Darwin ]; then
            #   export DTYPE=float16
            # fi
            python generate.py --dtype ${DTYPE} --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******* Emb: channel-wise quantized ******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** Emb: group-wise quantized *******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******* INT8 channel-wise quantized ******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** INT8 group-wise quantized *******"
            echo "******************************************"
            python generate.py --dtype ${DTYPE} --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            cat ./output_eager

            echo "******************************************"
            echo "******** INT4 group-wise quantized *******"
            echo "******************************************"

            echo "INT4 should work on MacOS on x86, but cannot be tested"
            echo "because nightlies are too old!"

            # python generate.py --dtype ${DTYPE} --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
            # cat ./output_eager

            echo "tests complete for ${DTYPE}"
          done

          echo "tests complete for all dtypes!"
# ---- job: test-mps (entry of the `jobs:` mapping) ----
  # Delegates to the reusable macos_job workflow from pytorch/test-infra on the
  # macos-m1-stable runner. The script runs generate.py with --device mps on
  # the stories15M checkpoint: unquantized, embedding-quantized, int8-linear
  # and int4-linear (the last with PYTORCH_ENABLE_MPS_FALLBACK=1).
  test-mps:
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m1-stable
      script: |
        set -x
        # NS: Remove previous installation of torch first
        # as this script does not isntall anything into conda env but rather as system dep
        pip uninstall -y torch || true

        set -eou pipefail

        echo "::group::Print machine info"
        uname -a
        sysctl machdep.cpu.brand_string
        sysctl machdep.cpu.core_count
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install requirements
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        ls -la
        pwd
        pip install -r requirements.txt
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        (
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
        )
        echo "::endgroup::"

        echo "::group::Run inference"
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        python generate.py --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager

        echo "************************************************************"
        echo "*** embedding"
        echo "************************************************************"

        python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
        python generate.py --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager

        echo "************************************************************"
        echo "*** linear int8"
        echo "************************************************************"

        python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
        python generate.py --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager

        echo "************************************************************"
        echo "*** linear int4"
        echo "************************************************************"

        PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
        cat ./output_eager
# ---- job: test-gguf-util (entry of the `jobs:` mapping) ----
  # On macos-14: builds llama.cpp, downloads a Q4_0 Llama-2-7B GGUF file,
  # requantizes it to F32 with llama.cpp's quantize tool, and passes both files
  # to build.gguf_util.test_by_to_float from a generated test.py.
  test-gguf-util:
    strategy:
      matrix:
        runner: [macos-14]
    runs-on: ${{matrix.runner}}
    steps:
      # Action versions and the quoted Python version match the CPU matrix
      # jobs of this workflow.
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          echo "Intalling pip packages"
          pip install gguf
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'

          git clone https://github.com/ggerganov/llama.cpp.git
          pushd llama.cpp
          make
          popd
      - name: Download GGUF files
        run: |
          mkdir gguf_files
          wget -O gguf_files/llama-2-7b.Q4_0.gguf "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true"
          ./llama.cpp/quantize --allow-requantize gguf_files/llama-2-7b.Q4_0.gguf gguf_files/llama-2-7b.Q4_0.requant_F32.gguf F32
      # test.py is assembled line by line with echo and then executed.
      - name: Load files
        run: |
          touch test.py
          echo "from build.gguf_util import test_by_to_float" >> test.py
          echo "test_by_to_float(\"gguf_files/llama-2-7b.Q4_0.gguf\", \"gguf_files/llama-2-7b.Q4_0.requant_F32.gguf\")" >> test.py
          cat test.py
          python test.py

          echo "Tests complete."
# ---- job: test-mps-dtype (entry of the `jobs:` mapping) ----
  # Delegates to the reusable macos_job workflow from pytorch/test-infra on the
  # macos-m1-stable runner. The script loops over float16 and float32 and, for
  # each dtype, runs generate.py with --device mps on the stories15M checkpoint
  # across the unquantized, embedding, int8-linear and int4-linear settings.
  test-mps-dtype:
    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
    with:
      runner: macos-m1-stable
      script: |
        set -x
        # NS: Remove previous installation of torch first
        # as this script does not isntall anything into conda env but rather as system dep
        pip uninstall -y torch || true

        set -eou pipefail

        echo "::group::Print machine info"
        uname -a
        sysctl machdep.cpu.brand_string
        sysctl machdep.cpu.core_count
        echo "::endgroup::"

        echo "::group::Install requirements"
        # Install requirements
        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
        ls -la
        pwd
        pip install -r requirements.txt
        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
        echo "::endgroup::"

        echo "::group::Download checkpoints"
        (
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          curl -fsSL -O https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          curl -fsSL -O https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          popd
        )
        echo "::endgroup::"

        echo "::group::Run inference"
        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
        export MODEL_NAME=stories15M
        export MODEL_DIR=/tmp

        for DTYPE in float16 float32; do
          # if [ $(uname -s) == Darwin ]; then
          #   export DTYPE=float16
          # fi

          python generate.py --dtype ${DTYPE} --device mps --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          python generate.py --dtype ${DTYPE} --device mps --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int8" : {"bitwidth": 8, "groupsize": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager

          PYTORCH_ENABLE_MPS_FALLBACK=1 python generate.py --dtype ${DTYPE} --device mps --quant '{"linear:int4" : {"groupsize": 32}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
          cat ./output_eager
        done
# ---- job: compile-gguf (entry of the `jobs:` mapping) ----
  # On macos-14: downloads a TinyLlama Q4_0 GGUF file and tokenizer, then runs
  # generate.py both without and with --compile for the unquantized,
  # channel-wise and group-wise embedding-quantized settings.
  compile-gguf:
    strategy:
      matrix:
        runner: [macos-14]
    runs-on: ${{matrix.runner}}
    steps:
      # Action versions and the quoted Python version match the CPU matrix
      # jobs of this workflow.
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          pip install gguf
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
      - name: Download GGUF
        run: |
          mkdir gguf_files
          export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
          export TOKENIZER_PATH=gguf_files/tokenizer.model
          wget -O ${GGUF_PATH} "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
          wget -O ${TOKENIZER_PATH} https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
      # The paths are exported again here because variables exported in the
      # download step do not carry over to this step.
      - name: Run inference
        run: |
          export GGUF_PATH=gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
          export TOKENIZER_PATH=gguf_files/tokenizer.model
          export MODEL_NAME=TinyLlama-1.1B-openorca.Q4_0.gguf
          export MODEL_DIR=/tmp

          echo "******************************************"
          echo "******* Embed: not quantized *************"
          echo "******************************************"

          echo "Running eager"
          python generate.py --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
          cat ./output_eager

          echo "Running compiled"
          python generate.py --compile --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
          cat ./output_compiled

          echo "******************************************"
          echo "******* Emb: channel-wise quantized ******"
          echo "******************************************"

          echo "Running eager"
          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
          cat ./output_eager

          echo "Running compiled"
          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
          cat ./output_compiled

          echo "******************************************"
          echo "******** Emb: group-wise quantized *******"
          echo "******************************************"

          echo "Running eager"
          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_eager
          cat ./output_eager

          echo "Running compiled"
          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' --gguf-path ${GGUF_PATH} --tokenizer-path ${TOKENIZER_PATH} --max-new-tokens 20 --temperature 0 > ./output_compiled
          cat ./output_compiled

          echo "tests complete"
          echo "******************************************"
# ---- job: runner-et (entry of the `jobs:` mapping) ----
  # On macos-14-xlarge: installs ExecuTorch through scripts/install_et.sh,
  # builds the runner under ./runner-et with CMake and Ninja, then compares
  # eager generation with the built runner on an exported stories15M .pte file.
  runner-et:
    strategy:
      matrix:
        runner: [macos-14-xlarge]
    runs-on: ${{matrix.runner}}
    steps:
      # Action versions and the quoted Python version match the CPU matrix
      # jobs of this workflow.
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Print machine info
        run: |
          uname -a
          if [ $(uname -s) == Darwin ]; then
            sysctl machdep.cpu.brand_string
            sysctl machdep.cpu.core_count
          fi
      - name: Install requirements
        run: |
          echo "Intalling pip packages"
          pip install -r requirements.txt

          export TORCHCHAT_ROOT=${PWD}
          export ENABLE_ET_PYBIND=false
          ./scripts/install_et.sh $ENABLE_ET_PYBIND
          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
          python3 -c 'import torchvision;print(f"torchvision: {torchvision.__version__, torchvision.version.git_version}")'
          python3 -c 'import torchaudio;print(f"torchaudio: {torchaudio.__version__, torchaudio.version.git_version}")'
          cmake -S ./runner-et -B ./runner-et/cmake-out -G Ninja
          cmake --build ./runner-et/cmake-out
      # The model and tokenizer downloads live in this step, which previously
      # had an empty script while the downloads ran inside "Run inference".
      # Both steps run from the same workspace, so the files remain available.
      - name: Download checkpoints
        run: |
          python torchchat.py download stories15M
          wget -O ./tokenizer.bin https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
      - name: Run inference
        run: |
          export PRMT="Once upon a time in a land far away"

          python torchchat.py generate stories15M --temperature 0 --prompt "${PRMT}" > ./output_eager
          cat ./output_eager

          python torchchat.py export stories15M --output-pte-path ./model.pte
          ./runner-et/cmake-out/run ./model.pte -z ./tokenizer.bin -t 0 -i "${PRMT}" > ./output_et
          cat ./output_et

          echo "Tests complete."
# ---- job: runner-aoti (entry of the `jobs:` mapping) ----
  # Matrix job fed by gather-models-cpu. Builds the runner under runner-aoti
  # with CMake against the installed torch, exports the stories15M checkpoint
  # to /tmp/model.so and runs it with the built runner after an eager run.
  # NOTE(review): the steps always use stories15M and never read REPO_NAME, so
  # each matrix entry appears to repeat the same work - confirm intent.
  runner-aoti:
    name: test-runner-aoti (${{ matrix.platform }}, ${{ matrix.model_name }})
    needs: gather-models-cpu
    runs-on: ${{ matrix.runner }}
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix: ${{ fromJSON(needs.gather-models-cpu.outputs.models) }}
    env:
      REPO_NAME: ${{ matrix.repo_name }}
      TORCHCHAT_ROOT: ${{ github.workspace }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Print machine info
        run: |
          echo "$(uname -a)"
      - name: Install dependencies
        run: |
          pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
          pip install -r requirements.txt
          pip list

          cd ${TORCHCHAT_ROOT}/runner-aoti
          cmake -Bbuild -DCMAKE_PREFIX_PATH=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'`
          cmake --build build
          cd ..
      - name: Download checkpoint
        run: |
          mkdir -p checkpoints/stories15M
          pushd checkpoints/stories15M
          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
          popd
      - name: Run inference
        run: |
          export MODEL_DIR=${PWD}/checkpoints/stories15M
          export PROMPT="Once upon a time in a land far away"

          python torchchat.py generate --checkpoint-path ${MODEL_DIR}/stories15M.pt --temperature 0 --prompt "${PROMPT}" > ${PWD}/output_eager
          cat ${PWD}/output_eager

          python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-dso-path /tmp/model.so

          ./runner-aoti/build/run /tmp/model.so -z ${MODEL_DIR}/tokenizer.bin -i "${PROMPT}" > ${PWD}/output_aoti
          cat ${PWD}/output_aoti

          echo "Tests complete."