Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FP32 and INT4 test in Llama2 #21187

Merged
merged 12 commits into from
Jun 27, 2024
67 changes: 51 additions & 16 deletions tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,6 @@ parameters:
type: number
default: 0

resources:
repositories:
- repository: LLaMa2Onnx
type: Github
endpoint: Microsoft
name: Microsoft/Llama-2-Onnx
ref: main

variables:
- template: templates/common-variables.yml
- name: docker_base_image
Expand Down Expand Up @@ -287,11 +279,12 @@ stages:
workingDirectory: $(Build.SourcesDirectory)
condition: ne(variables.hitAnother, 'True')

- stage: Llama2_ONNX_FP16
- stage: Llama2_7B_ONNX
dependsOn:
- Build_Onnxruntime_Cuda
jobs:
- job: Llama2_ONNX_FP16
- job: Llama2_7B_ONNX
timeoutInMinutes: 120
variables:
skipComponentGovernanceDetection: true
workspace:
Expand Down Expand Up @@ -319,15 +312,15 @@ stages:

- template: templates/get-docker-image-steps.yml
parameters:
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
Context: tools/ci_build/github/linux/docker/
ScriptName: tools/ci_build/get_docker_image.py
DockerBuildArgs: "
--build-arg BUILD_UID=$( id -u )
--build-arg BASEIMAGE=${{ variables.docker_base_image }}
--build-arg TRT_VERSION=${{ variables.linux_trt_version }}
"
Repository: onnxruntimeubi8packagestest
Repository: onnxruntimeubi8packagestest_torch
UpdateDepsTxt: false

- task: DownloadPackage@1
Expand All @@ -343,7 +336,7 @@ stages:
docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
-v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
-v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
onnxruntimeubi8packagestest \
onnxruntimeubi8packagestest_torch \
bash -c "
set -ex; \
pushd /workspace/onnxruntime/python/tools/transformers/ ; \
Expand All @@ -352,14 +345,56 @@ stages:
python3 -m pip install -r requirements.txt ; \
popd ; \
python3 -m pip install /ort-artifact/*.whl ; \
python3 -m pip uninstall -y torch ; \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --small_gpu ;\
ls -l llama2-7b-fp16; \
du -sh llama2-7b-fp16; \
popd ; \
"
displayName: 'Run Llama2 to Onnx F16 and parity Test'
workingDirectory: $(Build.SourcesDirectory)

- script: |
docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
-v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
-v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
onnxruntimeubi8packagestest_torch \
bash -c "
set -ex; \
pushd /workspace/onnxruntime/python/tools/transformers/ ; \
python3 -m pip install --upgrade pip ; \
pushd models/llama ; \
python3 -m pip install -r requirements.txt ; \
popd ; \
python3 -m pip install /ort-artifact/*.whl ; \
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda;\
ls -l llama2-7b-fp32-gpu; \
du -sh llama2-7b-fp32-gpu; \
popd ; \
"
displayName: 'Run Llama2 to Onnx fp32 and parity Test'
workingDirectory: $(Build.SourcesDirectory)

- script: |
docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
-v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
-v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
onnxruntimeubi8packagestest_torch \
bash -c "
set -ex; \
pushd /workspace/onnxruntime/python/tools/transformers/ ; \
python3 -m pip install --upgrade pip ; \
pushd models/llama ; \
python3 -m pip install -r requirements.txt ; \
popd ; \
python3 -m pip install /ort-artifact/*.whl ; \
python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-int4-gpu --precision int4 --execution_provider cuda --use_gqa;\
ls -l llama2-7b-int4-gpu; \
du -sh llama2-7b-int4-gpu; \
popd ; \
"
displayName: 'Run Llama2 to Onnx INT4 and parity Test'
workingDirectory: $(Build.SourcesDirectory)

- stage: Whisper_ONNX
dependsOn:
- Build_Onnxruntime_Cuda
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# --------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------
# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10.0 and CUDA 11.8 by default.
# Variant of the TensorRT test image that additionally replaces the bundled torch
# with the CUDA 11.8 wheel from the PyTorch index (needed for the Llama2 CI jobs).

# Build base image with required system packages
ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
ARG TRT_VERSION=10.0.1.6-1.cuda11.8
FROM $BASEIMAGE AS base
# Re-declare after FROM so the value is visible in this build stage.
ARG TRT_VERSION
ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}

RUN dnf install -y bash wget &&\
    dnf clean dbcache

RUN pip3 install --upgrade pip
# Quote the requirement specifier: an unquoted `>=68.2.2` is interpreted by the
# shell as an output redirection (creating a file named `=68.2.2`) and installs
# an unconstrained setuptools instead of the pinned minimum.
RUN pip3 install "setuptools>=68.2.2"

# Install TensorRT only if TRT_VERSION is not empty
RUN if [ -n "$TRT_VERSION" ]; then \
    echo "TRT_VERSION is $TRT_VERSION" && \
    dnf -y install \
    libnvinfer10-${TRT_VERSION} \
    libnvinfer-headers-devel-${TRT_VERSION} \
    libnvinfer-devel-${TRT_VERSION} \
    libnvinfer-lean10-${TRT_VERSION} \
    libnvonnxparsers10-${TRT_VERSION} \
    libnvonnxparsers-devel-${TRT_VERSION} \
    libnvinfer-dispatch10-${TRT_VERSION} \
    libnvinfer-plugin10-${TRT_VERSION} \
    libnvinfer-vc-plugin10-${TRT_VERSION} \
    libnvinfer-bin-${TRT_VERSION} \
    libnvinfer-plugin-devel-${TRT_VERSION} \
    libnvinfer-vc-plugin-devel-${TRT_VERSION} \
    libnvinfer-lean-devel-${TRT_VERSION} \
    libnvinfer-dispatch-devel-${TRT_VERSION} \
    libnvinfer-headers-plugin-devel-${TRT_VERSION} && \
    dnf clean dbcache ; \
else \
    echo "TRT_VERSION is none skipping Tensor RT Installation" ; \
fi

ADD scripts /tmp/scripts
RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && /tmp/scripts/install_java.sh && rm -rf /tmp/scripts

# Swap the preinstalled torch for the CUDA 11.8 build from the PyTorch wheel index.
RUN python3 -m pip uninstall -y torch
RUN python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118

# Build final image from base.
FROM base AS final
ARG BUILD_USER=onnxruntimedev
ARG BUILD_UID=1000
RUN adduser --uid $BUILD_UID $BUILD_USER
WORKDIR /home/$BUILD_USER
USER $BUILD_USER
Loading