diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
index 71f52faa2c1e6..c9ff384a4c856 100644
--- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
@@ -781,6 +781,13 @@ def get_args():
         action="store_true",
         help="Avoid exporting model, only apply quantizations and optimizations to existing model exported from optimum.",
     )
+
+    parser.add_argument(
+        "--small_gpu",
+        action="store_true",
+        help="Load the PyTorch Llama model separately for each parity check and free it before ORT runs. Use this on machines with less than 36GB of GPU memory.",
+    )
+
     parser.set_defaults(optimize_optimum=False)
 
     args = parser.parse_args()
@@ -788,9 +795,7 @@ def get_args():
 
 
 def main():
-    if version.parse(torch.__version__) < version.parse("2.2.0") and "2.2.0.dev" not in torch.__version__:
-        # Second predicate is for comparing nightly (ex: 2.2.0.dev20230920 vs 2.2.0) since first predicate is false
-        # in that scenario. It can be removed when torch v2.2.0 is released in stable.
+    if version.parse(torch.__version__) < version.parse("2.2.0"):
         logger.error(f"Detected PyTorch version {torch.__version__}. Please upgrade and use v2.2.0 or newer.")
         return
 
@@ -1021,7 +1026,11 @@ def main():
                 args.precision,
                 "--cache_dir",
                 args.cache_dir,
+                "--torch_model_directory",
+                args.input,
             ]
+            if args.small_gpu:
+                parity_cmd.append("--small_gpu")
             if "with_past" in filename:
                 parity_cmd.append("--use_past_kv")
             if "merged" in filename:
@@ -1030,7 +1039,7 @@ def main():
                 parity_cmd.append("--use_gqa")
 
             try:
-                logger.debug(f"check parity with cmd: {parity_cmd}")
+                logger.info(f"check parity with cmd: {parity_cmd}")
                 parity_check(parity_cmd)
             except Exception as e:
                 logger.warning(f"An error occurred while verifying parity: {e}", exc_info=True)
diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py
index 25d7519769604..f41a90208c51b 100644
--- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py
+++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py
@@ -17,7 +17,7 @@
     get_sample_with_past_kv_inputs,
 )
 from llama_torch import setup_torch_model
-from transformers import AutoConfig, AutoModelForCausalLM
+from transformers import AutoConfig
 
 import onnxruntime as ort
 
@@ -67,20 +67,39 @@ def get_inputs(args: argparse.Namespace, config: AutoConfig):
 
 
 def verify_parity(
-    args: argparse.Namespace, config: AutoConfig, pt_model: AutoModelForCausalLM, kv_cache_ortvalues: dict
+    args: argparse.Namespace,
+    location: str,
+    use_auth_token: bool,
+    kv_cache_ortvalues: dict,
+    pytorch_model: None | torch.nn.Module = None,
+    config: None | AutoConfig = None,
 ):
+    # On a machine with less than 36GB of GPU memory, load the Llama model here and unload it promptly so the GPU memory is freed for ORT.
+    py_model = pytorch_model
+    if py_model is None:
+        config, py_model = setup_torch_model(
+            args,
+            location,
+            use_auth_token,
+            torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
+            device=args.device,
+        )
+
     inputs = get_inputs(args, config)
 
     # Run inference with PyTorch
     if args.execution_provider != "cpu":
         torch.cuda.synchronize()
     start_time = time.time()
-    pt_outputs = pt_model(**inputs).logits.detach().cpu().numpy()
+    pt_outputs = py_model(**inputs).logits.detach().cpu().numpy()
     if args.execution_provider != "cpu":
         torch.cuda.synchronize()
     end_time = time.time()
     logger.info(f"PyTorch took {end_time - start_time} s")
-    del pt_model
+
+    if args.small_gpu and py_model is not None:
+        del py_model
+        torch.cuda.empty_cache()
 
     # Run inference with ORT
     past_sequence_length, _, max_sequence_length = get_sequence_lengths(args)
@@ -222,6 +241,13 @@ def get_args(argv: list[str]):
         help="model cache dir to override default HF cache dir to avoid overflood the /home dir",
     )
 
+    # This argument is mainly for CI, where the machines have at most 24GB of GPU memory.
+    parser.add_argument(
+        "--small_gpu",
+        action="store_true",
+        help="Load the PyTorch Llama model separately for each parity check and free it before ORT runs. Use this on machines with less than 36GB of GPU memory.",
+    )
+
     args = parser.parse_args() if argv == [] else parser.parse_args(argv)
 
     # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models
@@ -247,25 +273,29 @@ def main(argv: list[str] = []):  # noqa: B006
     use_auth_token = args.torch_model_directory == os.path.join(".")
     location = args.model_name if use_auth_token else args.torch_model_directory
 
-    config, llama = setup_torch_model(
-        args,
-        location,
-        use_auth_token,
-        torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
-        device=args.device,
-    )
-
     kv_cache_ortvalues = {}
     if not args.merged:
-        verify_parity(args, config, llama, kv_cache_ortvalues)
+        verify_parity(args, location, use_auth_token, kv_cache_ortvalues)
     else:
-        # Verify prompt generation in merged model (decoder_model.onnx)
+        config = llama = None
+        if not args.small_gpu:
+            config, llama = setup_torch_model(
+                args,
+                location,
+                use_auth_token,
+                torch_dtype=(torch.float16 if args.use_fp16 else torch.float32),
+                device=args.device,
+            )
+
+        # Verify prompt processing in merged model (decoder_model.onnx)
        args.use_past_kv = False
-        kv_cache_ortvalues = verify_parity(args, config, llama, kv_cache_ortvalues)
+        kv_cache_ortvalues = verify_parity(
+            args, location, use_auth_token, kv_cache_ortvalues, pytorch_model=llama, config=config
+        )
 
         # Verify token generation in merged model (decoder_with_past_model.onnx)
         args.use_past_kv = True
-        verify_parity(args, config, llama, kv_cache_ortvalues)
+        verify_parity(args, location, use_auth_token, kv_cache_ortvalues, pytorch_model=llama, config=config)
 
 
 if __name__ == "__main__":
diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
index b634bcc50f6e4..acd9c23aa42d0 100644
--- a/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
+++ b/onnxruntime/python/tools/transformers/models/llama/requirements-cuda.txt
@@ -1,4 +1,4 @@
 -r requirements.txt
-# Please manually install torch>=2.2.0.dev20230920 with CUDA enabled for the CUDA version installed in your system.
+# Please manually install torch>=2.2.0 with CUDA enabled for the CUDA version installed in your system.
# Instructions can be found here: https://pytorch.org/get-started/locally/ -onnxruntime-gpu>=1.16.2 \ No newline at end of file +onnxruntime-gpu>=1.16.2 diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements.txt b/onnxruntime/python/tools/transformers/models/llama/requirements.txt index b72c972e7a16a..8b57279295e35 100644 --- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt @@ -1,6 +1,6 @@ optimum>=1.14.1 transformers>=4.33.2 -torch>=2.2.0.dev20230920 +torch>=2.2.0 onnx>=1.14.0 datasets>=2.8.0 -protobuf==3.20.2 \ No newline at end of file +protobuf==3.20.2 diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 0de2ac44215c4..65866fc9827a5 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -268,7 +268,7 @@ stages: skipComponentGovernanceDetection: true workspace: clean: all - pool: onnxruntime-Linux-GPU-T4 + pool: Onnxruntime-Linux-A10-24G steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' @@ -278,10 +278,6 @@ stages: clean: true submodules: none - - checkout: LLaMa2Onnx - clean: true - submodules: none - - template: templates/flex-downloadPipelineArtifact.yml parameters: StepName: 'Download Onnxruntime Artifact' @@ -290,47 +286,40 @@ stages: SpecificArtifact: ${{ parameters.specificArtifact }} BuildId: ${{ parameters.BuildId }} - - task: DownloadPackage@1 - displayName: 'Download Llama2 model' - inputs: - packageType: upack - feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' - version: 1.0.0 - definition: '772ebce3-7e06-46d5-b3cc-82040ec4b2ce' - downloadPath: $(Agent.TempDirectory)/llama2_onnx_ft16 - - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: onnxruntime/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 - Context: onnxruntime/tools/ci_build/github/linux/docker/ - ScriptName: onnxruntime/tools/ci_build/get_docker_image.py + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 + Context: tools/ci_build/github/linux/docker/ + ScriptName: tools/ci_build/get_docker_image.py DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimeubi8packagestest UpdateDepsTxt: false + - task: DownloadPackage@1 + displayName: 'Download Meta Llama2 model' + inputs: + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + version: 1.0.0 + definition: '6fe0c4ed-9d0e-4d66-94cc-fb6a111d02a5' + downloadPath: $(Agent.TempDirectory)/meta_llama2_7b_hf + - script: | - docker run --rm --gpus all -v $(Build.SourcesDirectory)/Llama-2-Onnx:/workspace \ + docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \ -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \ - -v $(Agent.TempDirectory)/llama2_onnx_ft16:/models \ + -v $(Agent.TempDirectory)/meta_llama2_7b_hf:/meta-llama2 \ onnxruntimeubi8packagestest \ bash -c " set -ex; \ + pushd /workspace/onnxruntime/python/tools/transformers/ ; \ python3 -m pip install --upgrade pip ; \ + pushd models/llama ; \ + python3 -m pip install -r requirements-cuda.txt ; \ + popd ; \ python3 -m pip install /ort-artifact/*.whl ; \ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ - python3 -m pip install sentencepiece ; \ - pushd /workspace ; 
\
-          python3 MinimumExample/Example_ONNX_LlamaV2.py --onnx_file /models/ONNX/LlamaV2_7B_FT_float16.onnx \
-            --embedding_file /models/embeddings.pth --tokenizer_path tokenizer.model --prompt 'What is the lightest element?' > /workspace/answer.txt ; \
+          python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ; \
           popd ; \
           "
-      displayName: 'Run Llama2 demo'
+      displayName: 'Run Llama2 to ONNX FP16 conversion and parity test'
       workingDirectory: $(Build.SourcesDirectory)
-
-    - script: |
-        set -ex
-        real=$(cat $(Build.SourcesDirectory)/Llama-2-Onnx/answer.txt)
-        trim_actual=$(tr -dc '[[:print:]]' <<< "$real")
-        expected="The lightest element is hydrogen. Hydrogen is the lightest element on the periodic table, with an atomic mass of 1.00794 u (unified atomic mass units)."
-        [ "$expected" == "$trim_actual" ] && exit 0 || exit 1
-      displayName: 'Check result'
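Note on the new --small_gpu code path (editor's sketch, not part of the patch): when the flag is set, llama_parity.py loads the PyTorch reference model lazily inside verify_parity, captures its logits, then deletes the model and calls torch.cuda.empty_cache() so the ONNX Runtime session has the GPU memory to itself. The standalone Python sketch below illustrates that load-compare-free pattern under that assumption; build_reference_model and its Linear stand-in are hypothetical placeholders, not code from this repository.

# Illustrative sketch of the --small_gpu parity pattern; not repository code.
import torch


def build_reference_model(device: torch.device) -> torch.nn.Module:
    # Placeholder for setup_torch_model(); a real run would load Llama-2 here.
    return torch.nn.Linear(16, 16).to(device).eval()


def run_parity_check() -> None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = torch.randn(1, 16, device=device)

    # 1. Load the PyTorch model only for the duration of this check.
    model = build_reference_model(device)
    with torch.no_grad():
        reference = model(inputs).cpu().numpy()

    # 2. Free the GPU memory before ONNX Runtime allocates its own buffers.
    del model
    if device.type == "cuda":
        torch.cuda.empty_cache()

    # 3. The ONNX Runtime session would run here and be compared against `reference`.
    print(reference.shape)


if __name__ == "__main__":
    run_parity_check()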