From 587e92c2791b8af512c15688c0d8469217f92122 Mon Sep 17 00:00:00 2001
From: Yi Zhang
Date: Fri, 28 Jun 2024 06:18:26 +0800
Subject: [PATCH] Add FP32 and INT4 tests for Llama2 (#21187)

### Description
Add FP32 and INT4 conversion-and-parity tests for Llama2-7B to the big-models CI pipeline, alongside the existing FP16 test. The stage is renamed from Llama2_ONNX_FP16 to Llama2_7B_ONNX and its job timeout is raised to 120 minutes. The tests now run in a new UBI8 CUDA/TensorRT test image that preinstalls a CUDA 11.8 build of torch, so each test step no longer has to uninstall and reinstall torch. The unused LLaMa2Onnx repository resource is removed.

### Motivation and Context
Extend Llama2 CI coverage beyond FP16 so regressions in the FP32 and INT4 (GQA) conversion paths are caught automatically.
---
 .../azure-pipelines/bigmodels-ci-pipeline.yml | 67 ++++++++++++++----
 ...rfile.package_ubi8_cuda_tensorrt10_0_torch | 57 ++++++++++++++++
 2 files changed, 108 insertions(+), 16 deletions(-)
 create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch

diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index 0c0cd8d0a870b..41b3c47ba0396 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -38,14 +38,6 @@ parameters:
     type: number
     default: 0
 
-resources:
-  repositories:
-  - repository: LLaMa2Onnx
-    type: Github
-    endpoint: Microsoft
-    name: Microsoft/Llama-2-Onnx
-    ref: main
-
 variables:
   - template: templates/common-variables.yml
   - name: docker_base_image
@@ -287,11 +279,12 @@ stages:
       workingDirectory: $(Build.SourcesDirectory)
       condition: ne(variables.hitAnother, 'True')
 
-- stage: Llama2_ONNX_FP16
+- stage: Llama2_7B_ONNX
   dependsOn:
   - Build_Onnxruntime_Cuda
   jobs:
-  - job: Llama2_ONNX_FP16
+  - job: Llama2_7B_ONNX
+    timeoutInMinutes: 120
     variables:
       skipComponentGovernanceDetection: true
     workspace:
@@ -319,7 +312,7 @@
 
     - template: templates/get-docker-image-steps.yml
       parameters:
-        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0
+        Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
        Context: tools/ci_build/github/linux/docker/
        ScriptName: tools/ci_build/get_docker_image.py
        DockerBuildArgs: "
@@ -327,7 +320,7 @@
          --build-arg BASEIMAGE=${{ variables.docker_base_image }}
          --build-arg TRT_VERSION=${{ variables.linux_trt_version }}
          "
-        Repository: onnxruntimeubi8packagestest
+        Repository: onnxruntimeubi8packagestest_torch
        UpdateDepsTxt: false
 
     - task: DownloadPackage@1
@@ -343,7 +336,7 @@
        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
          -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
-          onnxruntimeubi8packagestest \
+          onnxruntimeubi8packagestest_torch \
          bash -c "
            set -ex; \
            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
@@ -352,14 +345,56 @@
            python3 -m pip install -r requirements.txt ; \
            popd ; \
            python3 -m pip install /ort-artifact/*.whl ; \
-            python3 -m pip uninstall -y torch ; \
-            python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \
-            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\
+            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --small_gpu ;\
+            ls -l llama2-7b-fp16; \
+            du -sh llama2-7b-fp16; \
            popd ; \
          "
      displayName: 'Run Llama2 to Onnx F16 and parity Test'
      workingDirectory: $(Build.SourcesDirectory)
 
+    - script: |
+        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
+          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
+          -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
+          onnxruntimeubi8packagestest_torch \
+          bash -c "
+            set -ex; \
+            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
+            python3 -m pip install --upgrade pip ; \
+            pushd models/llama ; \
+            python3 -m pip install -r requirements.txt ; \
+            popd ; \
+            python3 -m pip install /ort-artifact/*.whl ; \
+            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-fp32-gpu --precision fp32 --execution_provider cuda;\
+            ls -l llama2-7b-fp32-gpu; \
+            du -sh llama2-7b-fp32-gpu; \
+            popd ; \
+          "
+      displayName: 'Run Llama2 to Onnx fp32 and parity Test'
+      workingDirectory: $(Build.SourcesDirectory)
+
+    - script: |
+        docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \
+          -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \
+          -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \
+          onnxruntimeubi8packagestest_torch \
+          bash -c "
+            set -ex; \
+            pushd /workspace/onnxruntime/python/tools/transformers/ ; \
+            python3 -m pip install --upgrade pip ; \
+            pushd models/llama ; \
+            python3 -m pip install -r requirements.txt ; \
+            popd ; \
+            python3 -m pip install /ort-artifact/*.whl ; \
+            python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --input /meta-llama2 --output llama2-7b-int4-gpu --precision int4 --execution_provider cuda --use_gqa;\
+            ls -l llama2-7b-int4-gpu; \
+            du -sh llama2-7b-int4-gpu; \
+            popd ; \
+          "
+      displayName: 'Run Llama2 to Onnx INT4 and parity Test'
+      workingDirectory: $(Build.SourcesDirectory)
+
 - stage: Whisper_ONNX
   dependsOn:
   - Build_Onnxruntime_Cuda
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
new file mode 100644
index 0000000000000..4542d3a3f2e4c
--- /dev/null
+++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch
@@ -0,0 +1,57 @@
+# --------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------
+# Dockerfile to test ONNX Runtime on UBI8 with TensorRT 10.0 and CUDA 11.8 by default
+
+# Build base image with required system packages
+ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8
+ARG TRT_VERSION=10.0.1.6-1.cuda11.8
+FROM $BASEIMAGE AS base
+ARG TRT_VERSION
+ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH}
+
+RUN dnf install -y bash wget &&\
+    dnf clean dbcache
+
+RUN pip3 install --upgrade pip
+RUN pip3 install "setuptools>=68.2.2"
+
+# Install TensorRT only if TRT_VERSION is not empty
+RUN if [ -n "$TRT_VERSION" ]; then \
+    echo "TRT_VERSION is $TRT_VERSION" && \
+    dnf -y install \
+    libnvinfer10-${TRT_VERSION} \
+    libnvinfer-headers-devel-${TRT_VERSION} \
+    libnvinfer-devel-${TRT_VERSION} \
+    libnvinfer-lean10-${TRT_VERSION} \
+    libnvonnxparsers10-${TRT_VERSION} \
+    libnvonnxparsers-devel-${TRT_VERSION} \
+    libnvinfer-dispatch10-${TRT_VERSION} \
+    libnvinfer-plugin10-${TRT_VERSION} \
+    libnvinfer-vc-plugin10-${TRT_VERSION} \
+    libnvinfer-bin-${TRT_VERSION} \
+    libnvinfer-plugin-devel-${TRT_VERSION} \
+    libnvinfer-vc-plugin-devel-${TRT_VERSION} \
+    libnvinfer-lean-devel-${TRT_VERSION} \
+    libnvinfer-dispatch-devel-${TRT_VERSION} \
+    libnvinfer-headers-plugin-devel-${TRT_VERSION} && \
+    dnf clean dbcache ; \
+else \
+    echo "TRT_VERSION is empty, skipping TensorRT installation" ; \
+fi
+
+ADD scripts /tmp/scripts
+RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && /tmp/scripts/install_java.sh && rm -rf /tmp/scripts
+
+# Replace any preinstalled torch with a CUDA 11.8 build so the Llama2 tests can run on GPU
+RUN python3 -m pip uninstall -y torch
+RUN python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118
+
+# Build final image from base.
+FROM base AS final
+ARG BUILD_USER=onnxruntimedev
+ARG BUILD_UID=1000
+RUN adduser --uid $BUILD_UID $BUILD_USER
+WORKDIR /home/$BUILD_USER
+USER $BUILD_USER