From d9fabc5eb82828354bea9c9a10d622c3cacb14e6 Mon Sep 17 00:00:00 2001
From: Michael Gschwind
Date: Fri, 5 Apr 2024 12:59:05 -0700
Subject: [PATCH] cuda ci on T4

---
 .github/workflows/compile_t4.yml | 108 +++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 .github/workflows/compile_t4.yml

diff --git a/.github/workflows/compile_t4.yml b/.github/workflows/compile_t4.yml
new file mode 100644
index 000000000..934ef7648
--- /dev/null
+++ b/.github/workflows/compile_t4.yml
@@ -0,0 +1,108 @@
+name: Compile main
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  run-tinystories:
+    strategy:
+      matrix:
+        include:
+          - name: CUDA Nightly
+            runner: 4-core-ubuntu-gpu-t4
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install requirements
+        run: |
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
+          pip install -r requirements.txt
+      - name: Download checkpoints
+        run: |
+          mkdir -p checkpoints/stories15M
+          pushd checkpoints/stories15M
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          popd
+      - name: Run inference
+        run: |
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_DIR=/tmp
+          python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --device cuda --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --device cuda --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******* Emb: channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******** Emb: group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --device cuda --compile --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --device cuda --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******* INT8 channel-wise quantized ******"
+          echo "******************************************"
+          python generate.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "******************************************"
+          echo "******** INT8 group-wise quantized *******"
+          echo "******************************************"
+          python generate.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --device cuda --compile --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --device cuda --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
+          python generate.py --device cuda --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+
+          echo "tests complete"
+          echo "******************************************"
+          # echo "********* EAGER vs TORCH.COMPILE *********"
+          # echo "******************************************"
+          # diff output_eager output_compiled
+          # echo "******************************************"
+          # echo "********* EAGER vs AOT INDUCTOR *********"
+          # echo "******************************************"
+          # diff output_eager output_aoti