From f28e2420b530797a64adc060b95476643ab71aa2 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Thu, 14 Mar 2024 17:00:15 -0700 Subject: [PATCH 01/36] Update example READMEs (#199) --- examples/csharp/README.md | 43 ++++++++++++++++++++--------------- examples/python/README.md | 48 ++++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/examples/csharp/README.md b/examples/csharp/README.md index d6b4010f3..dce940811 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -2,31 +2,38 @@ ## Install the onnxruntime-genai library -* Install the python package - - ```bash - pip install onnxruntime-genai - ``` ## Get the model -Install the model builder script dependencies +You can generate the model using the model builder this library, or bring your own model. + +If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). + +To generate the model with model builder: + +1. Install the python package -```bash -pip install numpy -pip install transformers -pip install torch -pip install onnx -pip install onnxruntime -``` + Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). -Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) -```bash -cd examples\\phi2\\csharp -python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o phi-2\ -``` +2. Install the model builder script dependencies + + ```bash + pip install numpy + pip install transformers + pip install torch + pip install onnx + pip install onnxruntime + ``` +3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) + + ```bash + cd examples/python + python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu + ``` ## Run the phi-2 model +Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). + Open [HelloPhi2.sln](HelloPhi2.sln) and run the console application. diff --git a/examples/python/README.md b/examples/python/README.md index 9c7a2cc25..2f20cfcd0 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -2,37 +2,39 @@ ## Install the onnxruntime-genai library -* Install the python package +Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). - ```bash - cd build/wheel - pip install onnxruntime-genai-*.whl - ``` ## Get the model -Install the model builder script dependencies +You can generate the model using the model builder this library, or bring your own model. -```bash -pip install numpy -pip install transformers -pip install torch -pip install onnx -pip install onnxruntime-gpu -``` +If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). -Choose a model. Examples of supported ones are: -- Phi-2 -- Mistral -- Gemma 2B IT -- LLama 7B +To generate the model with model builder: -Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) +1. 
Install the model builder script dependencies -```bash -cd examples/python -python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu -``` + ```bash + pip install numpy + pip install transformers + pip install torch + pip install onnx + pip install onnxruntime + ``` + +2. Choose a model. Examples of supported ones are: + - Phi-2 + - Mistral + - Gemma 2B IT + - LLama 7B + +3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) + + ```bash + cd examples/python + python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu + ``` ## Run the example model script From 143308f5fa513b7765747562631d66c6b26fba05 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Thu, 14 Mar 2024 20:55:21 -0700 Subject: [PATCH 02/36] Better error checking & reporting (#203) Default batch size to 1 + sanity check batch_size Improve a common JSON error messge wording for extra ',' Improve error reporting in python search parameter type --- examples/python/model-chat.py | 4 ++++ src/generators.cpp | 4 ++++ src/generators.h | 2 +- src/json.cpp | 4 ++-- src/python/python.cpp | 5 +++-- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py index e31403065..edbfc1fc0 100644 --- a/examples/python/model-chat.py +++ b/examples/python/model-chat.py @@ -13,6 +13,10 @@ def main(args): # Keep asking for input prompts in an loop while True: text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + input_tokens = tokenizer.encode(text) params = og.GeneratorParams(model) diff --git a/src/generators.cpp b/src/generators.cpp index db27628da..640b6559e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -71,6 +71,10 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ throw std::runtime_error("search max_length is 0"); if (params.search.max_length > model.config_->model.context_length) throw std::runtime_error("max_length cannot be greater than model context_length"); + if (params.batch_size < 1) + throw std::runtime_error("batch_size must be 1 or greater"); + if (params.vocab_size < 1) + throw std::runtime_error("vocab_size must be 1 or greater"); search_ = CreateSearch(params); state_ = model.CreateState(search_->GetSequenceLengths(), params); diff --git a/src/generators.h b/src/generators.h index af98aea44..433fda103 100644 --- a/src/generators.h +++ b/src/generators.h @@ -56,7 +56,7 @@ struct GeneratorParams { int vocab_size{}; int context_length{}; - int batch_size{}; + int batch_size{1}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } diff --git a/src/json.cpp b/src/json.cpp index 412b98509..7e487ac99 100644 --- a/src/json.cpp +++ b/src/json.cpp @@ -49,7 +49,7 @@ JSON::JSON(Element& element, std::string_view document) : begin_{document.data() int line = 1; const auto* last_cr = begin_; for (const auto* p = begin_; p < current_; p++) { - if (*p == '\r') { + if (*p == '\n') { line++; last_cr = p; } @@ -108,7 +108,7 @@ void JSON::Parse_Object(Element& element) { while (true) { if (!Skip('\"')) { - throw std::runtime_error("Expecting \""); + throw std::runtime_error("Expecting \" to start next object name, possibly due to an extra trailing ',' before this"); } auto name = Parse_String(); diff --git 
a/src/python/python.cpp b/src/python/python.cpp index 3ed2e6170..a1667eb90 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -3,6 +3,7 @@ #include #include #include "../generators.h" +#include "../json.h" #include "../search.h" #include "../models/model.h" @@ -97,8 +98,8 @@ struct PyGeneratorParams : GeneratorParams { } else if (pybind11::isinstance(entry.second)) { SetSearchNumber(search, name, entry.second.cast()); } else - throw std::runtime_error("Unknown search option type, can be float/bool/int"); - } catch (const std::exception& e) { + throw std::runtime_error("Unknown search option type, can be float/bool/int:" + name); + } catch (JSON::unknown_value_error& e) { throw std::runtime_error("Unknown search option:" + name); } } From b740fd3238b83613f67f575d0a1ad8dfd9e7030d Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 15 Mar 2024 09:08:58 -0700 Subject: [PATCH 03/36] C# example README update (#202) --- examples/csharp/README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/csharp/README.md b/examples/csharp/README.md index dce940811..cb6675e74 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -1,13 +1,8 @@ # Gen-AI C# Phi-2 Example -## Install the onnxruntime-genai library - - ## Get the model -You can generate the model using the model builder this library, or bring your own model. - -If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). +You can generate the model using the model builder provided with this library, or bring your own model. To generate the model with model builder: @@ -15,7 +10,6 @@ To generate the model with model builder: Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). - 2. Install the model builder script dependencies ```bash @@ -25,6 +19,7 @@ To generate the model with model builder: pip install onnx pip install onnxruntime ``` + 3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) ```bash @@ -32,6 +27,10 @@ To generate the model with model builder: python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu ``` +The model builder also generates the configuration needed by the API to run generation. You can modify the config according to your scenario. + +If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). + ## Run the phi-2 model Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). From e738f62ab4aac53612dcdca758d449cb772ac0c8 Mon Sep 17 00:00:00 2001 From: Adam Clark Date: Sat, 16 Mar 2024 05:55:40 +1300 Subject: [PATCH 04/36] C# Streaming Token Example (#205) --- examples/csharp/Program.cs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/csharp/Program.cs b/examples/csharp/Program.cs index e70920cc0..993af8b57 100644 --- a/examples/csharp/Program.cs +++ b/examples/csharp/Program.cs @@ -9,6 +9,11 @@ using Model model = new Model(modelPath); using Tokenizer tokenizer = new Tokenizer(model); +Console.WriteLine("Please enter option number:"); +Console.WriteLine("1. Complete Output"); +Console.WriteLine("2. 
Streaming Output"); +int.TryParse(Console.ReadLine(), out var option); + while (true) { Console.WriteLine("Prompt:"); @@ -21,9 +26,24 @@ generatorParams.SetSearchOption("max_length", 200); generatorParams.SetInputSequences(sequences); - var outputSequences = model.Generate(generatorParams); - var outputString = tokenizer.Decode(outputSequences[0]); + if (option == 1) // Complete Output + { + var outputSequences = model.Generate(generatorParams); + var outputString = tokenizer.Decode(outputSequences[0]); + + Console.WriteLine("Output:"); + Console.WriteLine(outputString); + } - Console.WriteLine("Output:"); - Console.WriteLine(outputString); + else if (option == 2) //Streaming Output + { + using var tokenizerStream = tokenizer.CreateStream(); + using var generator = new Generator(model, generatorParams); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTop(); + Console.Write(tokenizerStream.Decode(generator.GetSequence(0)[^1])); + } + } } From 1edff307364f655f7a869d45d7d0dfe78af15090 Mon Sep 17 00:00:00 2001 From: Adam Clark Date: Sat, 16 Mar 2024 06:05:56 +1300 Subject: [PATCH 05/36] Add missing windows build step (#204) Added copy step for `onnxruntime.lib` which seems to be required for building on windows Resolves Error: `LINK : fatal error LNK1104: cannot open file 'onnxruntime.lib' [D:\Repositories\onnxruntime-genai\build\src\python\pyth on.vcxproj]` --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index be4fd5eff..769e9ab89 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ This step requires `cmake` to be installed. build.bat --config RelWithDebInfo --build_shared_lib --skip_tests --parallel [--use_cuda] copy include\onnxruntime\core\session\onnxruntime_c_api.h $ORT_HOME\include copy build\Windows\RelWithDebInfo\RelWithDebInfo\*.dll $ORT_HOME\lib + copy build\Windows\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $ORT_HOME\lib ``` On Linux From 5679d50850787ded496e362df35c71808ecf67df Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 15 Mar 2024 15:38:48 -0400 Subject: [PATCH 06/36] Cjian/preset build dir (#194) Change the CMake binaryDir consistent throughout the preset. 
This way we can simply reference it with $(ep) and $(config) --- .github/workflows/linux-cpu-arm64-build.yml | 4 +- .github/workflows/linux-cpu-x64-build.yml | 6 +-- .github/workflows/linux-gpu-x64-build.yml | 4 +- .github/workflows/mac-cpu-arm64-build.yml | 5 ++- .github/workflows/win-cpu-arm64-build.yml | 17 ++++---- .github/workflows/win-cpu-x64-build.yml | 17 ++++---- .github/workflows/win-gpu-x64-build.yml | 15 ++++--- .../stages/jobs/nuget-linux-packaging-job.yml | 2 +- .../stages/jobs/nuget-win-packaging-job.yml | 2 +- .../stages/jobs/py-win-packaging-job.yml | 4 +- .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../presets/CMakeLinuxClangConfigPresets.json | 16 ++++---- .../CMakeLinuxDefaultConfigPresets.json | 2 + cmake/presets/CMakeLinuxGccConfigPresets.json | 32 +++++++-------- cmake/presets/CMakeWinBuildPresets.json | 18 ++++----- cmake/presets/CMakeWinConfigPresets.json | 40 +++++++++++-------- src/python/CMakeLists.txt | 13 +----- src/tokenizer/CMakeLists.txt | 14 +------ test/CMakeLists.txt | 13 +----- 19 files changed, 106 insertions(+), 120 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index ec31c67ab..5018bdbb6 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -56,10 +56,10 @@ jobs: run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "ls -l /onnxruntime_src/build/gcc_cpu/release/test/" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "ls -l /onnxruntime_src/build/cpu/test/" - name: Docker -- Run tests run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/onnxruntime_src/build/gcc_cpu/release/test/unit_tests" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/onnxruntime_src/build/cpu/test/unit_tests" diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index fe5c92ad5..da202e19c 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -39,7 +39,7 @@ jobs: - name: Install the python wheel and test dependencies run: | - python3 -m pip install build/gcc_cpu/release/wheel/onnxruntime_genai*.whl + python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl python3 -m pip install -r test/python/requirements-nightly-cpu.txt --user - name: Get HuggingFace Token @@ -59,9 +59,9 @@ jobs: if: always() continue-on-error: true run: | - ls -l ${{ github.workspace }}/build/gcc_cpu/release + ls -l ${{ github.workspace }}/build/cpu - name: Run tests run: | set -e -x - ./build/gcc_cpu/release/test/unit_tests + ./build/cpu/test/unit_tests diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 3a08b8b05..c4e4c372a 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -63,7 +63,7 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "python3 -m pip install /onnxruntime_src/build/gcc_cuda/release/wheel/onnxruntime_genai*.whl --user && python3 -m pip install -r test/python/requirements.txt --user && python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models" + -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "python3 -m pip install /onnxruntime_src/build/cuda/wheel/onnxruntime_genai*.whl --user && 
python3 -m pip install -r test/python/requirements.txt --user && python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models" - name: Docker -- Run tests run: | @@ -72,4 +72,4 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "/onnxruntime_src/build/gcc_cuda/release/test/unit_tests" + -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "/onnxruntime_src/build/cuda/test/unit_tests" diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 8c56512e0..d757370f4 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -33,11 +33,12 @@ jobs: run: | mv ${{ env.ort_dir }} ort - - name: Build with CMake and Clang + - name: Configure CMake run: | + cmake --preset macos_cpu_release + - name: Build with CMake run: | - cmake --preset macos_cpu_release cmake --build --preset macos_cpu_release --parallel continue-on-error: false diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 4b43c5bae..7c64ba8ff 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -14,7 +14,7 @@ env: ort_dir: "onnxruntime-win-arm64-1.17.1" ort_zip: "$(ort_dir).zip" ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)" - cmake_build_dir: 'build/release/cpu_default' + binaryDir: 'build/cpu' jobs: windows-cpu-arm64-build: @@ -45,14 +45,17 @@ jobs: run: | Rename-Item -Path $env:ort_dir -NewName ort - - name: Build with CMake + - name: Configure CMake run: | cmake --preset windows_arm64_cpu_release + + - name: Build with CMake + run: | cmake --build --preset windows_arm64_cpu_release --parallel - name: Install the Python Wheel and Test Dependencies run: | - python -m pip install (Get-ChildItem ("$env:cmake_build_dir\wheel\*.whl")) + python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) python -m pip install -r test\python\requirements.txt - name: Run the Python Tests @@ -62,15 +65,15 @@ jobs: - name: Build the C# API and Run the C# Tests run: | cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:cmake_build_dir\Release" + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() continue-on-error: true run: | - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir -Recurse - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir\test -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir\test -Recurse - name: Run tests run: | - .\build\release\cpu_default\test\Release\unit_tests.exe \ No newline at end of file + .\build\cpu\test\Release\unit_tests.exe \ No newline at end of file diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index cfe792005..f13f3c2c8 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -14,7 +14,7 @@ env: ort_dir: "onnxruntime-win-x64-1.17.1" ort_zip: "$(ort_dir).zip" ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)" - cmake_build_dir: 'build/release/cpu_default' + binaryDir: 'build/cpu' jobs: windows-cpu-x64-build: @@ -52,14 +52,17 @@ jobs: with: languages: 'cpp' - - name: Build with CMake + - name: Configure CMake run: | cmake 
--preset windows_x64_cpu_release + + - name: Build with CMake + run: | cmake --build --preset windows_x64_cpu_release --parallel - name: Install the python wheel and test dependencies run: | - python -m pip install (Get-ChildItem ("$env:cmake_build_dir\wheel\*.whl")) + python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) python -m pip install -r test\python\requirements-nightly-cpu.txt - name: Get HuggingFace Token @@ -76,18 +79,18 @@ jobs: - name: Build the C# API and Run the C# Tests run: | cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:cmake_build_dir\Release" + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() continue-on-error: true run: | - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir -Recurse - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir\test -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir\test -Recurse - name: Run tests run: | - .\build\release\cpu_default\test\Release\unit_tests.exe + .\build\cpu\test\Release\unit_tests.exe - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/win-gpu-x64-build.yml b/.github/workflows/win-gpu-x64-build.yml index 48afb21d4..a3f1d338b 100644 --- a/.github/workflows/win-gpu-x64-build.yml +++ b/.github/workflows/win-gpu-x64-build.yml @@ -14,7 +14,7 @@ env: cuda_dir: "${{ github.workspace }}\\cuda_sdk" cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 - cmake_build_dir: 'build/release/cuda_default' + binaryDir: 'build/cuda' jobs: @@ -47,9 +47,12 @@ jobs: run: | Rename-Item -Path $env:ort_dir -NewName ort - - name: Build with CMake + - name: Configure CMake run: | cmake --preset windows_x64_cuda_release -T cuda=${{ env.cuda_dir }}\\v${{ env.cuda_version }} -DTEST_PHI2=False + + - name: Build with CMake + run: | cmake --build --preset windows_x64_cuda_release --parallel - name: Add CUDA to PATH @@ -58,7 +61,7 @@ jobs: - name: Install the Python Wheel and Test Dependencies run: | - python -m pip install (Get-ChildItem ("$env:cmake_build_dir\wheel\*.whl")) + python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) python -m pip install -r test\python\requirements-nightly-cpu.txt - name: Get HuggingFace Token @@ -75,17 +78,17 @@ jobs: - name: Build the C# API and Run the C# Tests run: | cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:cmake_build_dir\Release" + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() continue-on-error: true run: | - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir -Recurse - name: Prepend CUDA to PATH and Run tests run: | $env:PATH = "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin;" + $env:PATH echo "Current PATH variable is: $env:PATH" - .\build\release\cuda_default\test\Release\unit_tests.exe \ No newline at end of file + .\build\cuda\test\Release\unit_tests.exe \ No newline at end of file diff --git a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml b/.pipelines/stages/jobs/nuget-linux-packaging-job.yml index 322a48c84..fdf9d7106 100644 --- a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-linux-packaging-job.yml @@ 
-23,7 +23,7 @@ jobs: - name: ep value: ${{ parameters.ep }} - name: buildDir - value: 'build/gcc_${{ parameters.ep }}/release' + value: 'build/${{ parameters.ep }}' - name: ort_filename ${{ if eq(parameters.ep, 'cpu') }}: value: 'onnxruntime-linux-${{ parameters.arch }}-${{ parameters.ort_version }}' diff --git a/.pipelines/stages/jobs/nuget-win-packaging-job.yml b/.pipelines/stages/jobs/nuget-win-packaging-job.yml index de370fe18..b15ceb1ee 100644 --- a/.pipelines/stages/jobs/nuget-win-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-win-packaging-job.yml @@ -30,7 +30,7 @@ jobs: - name: ep value: ${{ parameters.ep }} - name: buildDir - value: 'build\release\${{ parameters.ep }}_default' + value: 'build\${{ parameters.ep }}' - name: artifactName value : 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}' - name: ort_filename diff --git a/.pipelines/stages/jobs/py-win-packaging-job.yml b/.pipelines/stages/jobs/py-win-packaging-job.yml index da6015aa9..88b285506 100644 --- a/.pipelines/stages/jobs/py-win-packaging-job.yml +++ b/.pipelines/stages/jobs/py-win-packaging-job.yml @@ -68,7 +68,7 @@ jobs: - template: steps/compliant/win-esrp-dll-step.yml parameters: - FolderPath: '$(Build.Repository.LocalPath)\build\release\$(ep)_default\wheel\onnxruntime_genai' + FolderPath: '$(Build.Repository.LocalPath)\build\$(ep)\wheel\onnxruntime_genai' DisplayName: 'ESRP - PYD Sign' DoEsrp: true Pattern: '*.pyd' @@ -83,7 +83,7 @@ jobs: - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: - SourceFolder: '$(Build.Repository.LocalPath)\build\release\$(ep)_default\wheel' + SourceFolder: '$(Build.Repository.LocalPath)\build\$(ep)\wheel' Contents: '*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 1e049ab29..897fae5d5 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -100,7 +100,7 @@ steps: - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: - SourceFolder: '$(Build.Repository.LocalPath)/build/gcc_$(ep)/release/wheel' + SourceFolder: '$(Build.Repository.LocalPath)/build/$(ep)/wheel' Contents: '*manylinux*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' diff --git a/cmake/presets/CMakeLinuxClangConfigPresets.json b/cmake/presets/CMakeLinuxClangConfigPresets.json index 59c10d88a..ce607d2f1 100644 --- a/cmake/presets/CMakeLinuxClangConfigPresets.json +++ b/cmake/presets/CMakeLinuxClangConfigPresets.json @@ -11,7 +11,7 @@ "linux_clang_asan_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_debug_asan", @@ -20,7 +20,7 @@ "linux_clang_asan_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/debug", + "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "CMAKE_C_FLAGS": "-ggdb3 -O0 -fsanitize=address", @@ -34,7 +34,7 @@ "linux_clang_asan_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_minsizerel_asan", @@ -43,7 +43,7 @@ "linux_clang_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/minsizerel", + "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": {} }, { @@ -53,7 +53,7 
@@ "linux_clang_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_debug", @@ -62,7 +62,7 @@ "linux_clang_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/debug" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_relwithdebinfo", @@ -71,7 +71,7 @@ "linux_clang_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_minsizerel", @@ -80,7 +80,7 @@ "linux_clang_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/minsizerel", + "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": {} } ] diff --git a/cmake/presets/CMakeLinuxDefaultConfigPresets.json b/cmake/presets/CMakeLinuxDefaultConfigPresets.json index 9e904dc10..559d1dae0 100644 --- a/cmake/presets/CMakeLinuxDefaultConfigPresets.json +++ b/cmake/presets/CMakeLinuxDefaultConfigPresets.json @@ -39,6 +39,7 @@ }, { "name": "linux_gcc_asan_default", + "inherits": "linux_gcc_default", "cacheVariables": { "CMAKE_EXE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address", "CMAKE_MODULE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address", @@ -47,6 +48,7 @@ }, { "name": "linux_clang_asan_default", + "inherits": "linux_clang_default", "cacheVariables": { "CMAKE_EXE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address -L\\usr\\lib64\\x86_64-unknown-linux-gnu", "CMAKE_MODULE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address -L\\usr\\lib64\\x86_64-unknown-linux-gnu", diff --git a/cmake/presets/CMakeLinuxGccConfigPresets.json b/cmake/presets/CMakeLinuxGccConfigPresets.json index 4e7b45a4a..d5518f9ad 100644 --- a/cmake/presets/CMakeLinuxGccConfigPresets.json +++ b/cmake/presets/CMakeLinuxGccConfigPresets.json @@ -12,7 +12,7 @@ "linux_gcc_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_debug_asan", @@ -22,7 +22,7 @@ "linux_gcc_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/debug" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_relwithdebinfo_asan", @@ -32,7 +32,7 @@ "linux_gcc_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_minsizerel_asan", @@ -42,7 +42,7 @@ "linux_gcc_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/minsizerel" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_release", @@ -51,7 +51,7 @@ "linux_gcc_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_debug", @@ -60,7 +60,7 @@ "linux_gcc_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/debug" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_relwithdebinfo", @@ -69,7 +69,7 @@ "linux_gcc_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_minsizerel", @@ -78,7 +78,7 
@@ "linux_gcc_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/minsizerel" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cuda_release_asan", @@ -88,7 +88,7 @@ "linux_gcc_cuda_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/release" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_debug_asan", @@ -98,7 +98,7 @@ "linux_gcc_cuda_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/debug" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_relwithdebinfo_asan", @@ -108,7 +108,7 @@ "linux_gcc_cuda_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_minsizerel_asan", @@ -118,7 +118,7 @@ "linux_gcc_cuda_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/minsizerel" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_release", @@ -127,7 +127,7 @@ "linux_gcc_cuda_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/release" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_debug", @@ -136,7 +136,7 @@ "linux_gcc_cuda_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/debug" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_relwithdebinfo", @@ -145,7 +145,7 @@ "linux_gcc_cuda_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_minsizerel", @@ -154,7 +154,7 @@ "linux_gcc_cuda_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/minsizerel" + "binaryDir": "${sourceDir}/build/cuda" } ] } \ No newline at end of file diff --git a/cmake/presets/CMakeWinBuildPresets.json b/cmake/presets/CMakeWinBuildPresets.json index b42eec934..1edfd4e13 100644 --- a/cmake/presets/CMakeWinBuildPresets.json +++ b/cmake/presets/CMakeWinBuildPresets.json @@ -16,12 +16,12 @@ }, { "name": "windows_x64_cpu_relwithdebinfo_asan", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cpu_relwithdebinfo_asan" }, { "name": "windows_x64_cpu_minsizerel_asan", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": "windows_x64_cpu_minsizerel_asan" }, { @@ -36,12 +36,12 @@ }, { "name": "windows_x64_cpu_relwithdebinfo", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cpu_relwithdebinfo" }, { "name": "windows_x64_cpu_minsizerel", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": "windows_x64_cpu_minsizerel" }, { @@ -56,12 +56,12 @@ }, { "name": "windows_x64_cuda_relwithdebinfo_asan", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cuda_relwithdebinfo_asan" }, { "name": "windows_x64_cuda_minsizerel_asan", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": "windows_x64_cuda_minsizerel_asan" }, { @@ -76,17 +76,17 @@ }, { "name": "windows_x64_cuda_relwithdebinfo", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cuda_relwithdebinfo" }, { "name": "windows_x64_cuda_minsizerel", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": 
"windows_x64_cuda_minsizerel" }, { "name": "windows_arm64_cpu_relwithdebinfo", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_arm64_cpu_relwithdebinfo" }, { diff --git a/cmake/presets/CMakeWinConfigPresets.json b/cmake/presets/CMakeWinConfigPresets.json index be70ca2d6..3b22aae07 100644 --- a/cmake/presets/CMakeWinConfigPresets.json +++ b/cmake/presets/CMakeWinConfigPresets.json @@ -28,6 +28,7 @@ { "name": "windows_release_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG" } @@ -35,6 +36,7 @@ { "name": "windows_debug_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1" } @@ -42,6 +44,7 @@ { "name": "windows_relwithdebinfo_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 /DNDEBUG", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 /DNDEBUG" } @@ -49,12 +52,14 @@ { "name": "windows_minsizerel_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "MinSizeRel", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG" } }, { "name": "windows_release_asan_default", + "inherits": "windows_release_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG /fsanitize=address" @@ -62,6 +67,7 @@ }, { "name": "windows_debug_asan_default", + "inherits": "windows_debug_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1 /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1 /fsanitize=address" @@ -69,6 +75,7 @@ }, { "name": "windows_relwithdebinfo_asan_default", + "inherits": "windows_relwithdebinfo_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 
/DNDEBUG /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 /DNDEBUG /fsanitize=address" @@ -76,6 +83,7 @@ }, { "name": "windows_minsizerel_asan_default", + "inherits": "windows_minsizerel_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG /fsanitize=address" @@ -88,7 +96,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu release asan", - "binaryDir": "${sourceDir}/build/release/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_debug_asan", @@ -97,7 +105,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu debug asan", - "binaryDir": "${sourceDir}/build/debug/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_relwithdebinfo_asan", @@ -106,7 +114,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu relwithdebinfo asan", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_minsizerel_asan", @@ -115,7 +123,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu minsizerel asan", - "binaryDir": "${sourceDir}/build/minsizerel/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_release", @@ -124,7 +132,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu release", - "binaryDir": "${sourceDir}/build/release/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cpu_debug", @@ -133,7 +141,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu debug", - "binaryDir": "${sourceDir}/build/debug/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cpu_relwithdebinfo", @@ -142,7 +150,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu relwithdebinfo", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cpu_minsizerel", @@ -151,7 +159,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu minsizerel", - "binaryDir": "${sourceDir}/build/minsizerel/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cuda_release_asan", @@ -160,7 +168,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda release asan", - "binaryDir": "${sourceDir}/build/release/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { "USE_CUDA": "ON" } @@ -172,7 +180,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda debug asan", - "binaryDir": "${sourceDir}/build/debug/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { "USE_CUDA": "ON" } @@ -184,7 +192,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda relwithdebinfo asan", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { "USE_CUDA": "ON" } @@ -196,7 +204,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda minsizerel asan", - "binaryDir": "${sourceDir}/build/minsizerel/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { 
"USE_CUDA": "ON" } @@ -208,7 +216,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda release", - "binaryDir": "${sourceDir}/build/release/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } @@ -220,7 +228,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda debug", - "binaryDir": "${sourceDir}/build/debug/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } @@ -232,7 +240,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda relwithdebinfo", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } @@ -244,7 +252,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda minsizerel", - "binaryDir": "${sourceDir}/build/minsizerel/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 6bce4cbd3..942664246 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -1,15 +1,4 @@ -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10) - add_compile_definitions(USE_CXX17=1) - message("Python is using C++17 because GCC Version is less than 10") - set(CMAKE_CXX_STANDARD 17) -elseif(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - add_compile_definitions(USE_CXX17=1) - message("Python is using C++17 Because CUDA Version is less than 12") - set(CMAKE_CXX_STANDARD 17) -else() - message("Python is using C++20") - set(CMAKE_CXX_STANDARD 20) -endif() +include(${CMAKE_SOURCE_DIR}/cmake/cxx_standard.cmake) file(GLOB python_srcs CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.h" diff --git a/src/tokenizer/CMakeLists.txt b/src/tokenizer/CMakeLists.txt index 1eb0fc6c6..69d603715 100644 --- a/src/tokenizer/CMakeLists.txt +++ b/src/tokenizer/CMakeLists.txt @@ -1,17 +1,5 @@ set(TOKENIZER_ROOT ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) - -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10) - add_compile_definitions(USE_CXX17=1) - message("Tokenizer is using C++17 because GCC Version is less than 10") - set(CMAKE_CXX_STANDARD 17) -elseif(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - add_compile_definitions(USE_CXX17=1) - message("Tokenizer is using C++17 Because CUDA Version is less than 12") - set(CMAKE_CXX_STANDARD 17) -else() - message("Tokenizer is using C++20") - set(CMAKE_CXX_STANDARD 20) -endif() +include(${CMAKE_SOURCE_DIR}/cmake/cxx_standard.cmake) file(GLOB tokenizer_srcs CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cc" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a4263eeda..83571118e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,17 +1,6 @@ enable_testing() -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10) - add_compile_definitions(USE_CXX17=1) - message("Test is using C++17 because GCC Version is less than 10") - set(CMAKE_CXX_STANDARD 17) -elseif(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - add_compile_definitions(USE_CXX17=1) - message("Test is using C++17 Because CUDA Version is less than 12") - set(CMAKE_CXX_STANDARD 17) -else() - message("Test is using C++20") - set(CMAKE_CXX_STANDARD 20) -endif() +include(${CMAKE_SOURCE_DIR}/cmake/cxx_standard.cmake) set(TESTS_ROOT ${CMAKE_CURRENT_SOURCE_DIR} 
PARENT_SCOPE) file(GLOB test_srcs CONFIGURE_DEPENDS From 7cc8062b8a17101f4bdd9367ca732c690309b6c1 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 15 Mar 2024 14:10:47 -0700 Subject: [PATCH 07/36] Update README.md (#201) --- README.md | 96 +++++++------------------------------------------------ 1 file changed, 11 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 769e9ab89..564ef3370 100644 --- a/README.md +++ b/README.md @@ -31,25 +31,27 @@ Users can call a high level `generate()` method, or run each iteration of the mo ## Coming very soon -* Support for the Whisper model architectures * Support for DirectML +* Support for the encoder decoder model architectures, such as whisper, T5 and BART. + +## Coming soon + +* Support for mobile devices (Android and iOS) with Java and Objective-C bindings ## Roadmap +* Stable diffusion pipeline * Automatic model download and cache * More model architectures ## Sample code for phi-2 in Python -Install onnxruntime-genai. - -(Temporary) Build and install from source according to the instructions below. - +[Install](https://onnxruntime.ai/docs/genai/install) the onnxruntime-genai Python package. ```python import onnxruntime_genai as og -model = og.Model(f'models/microsoft/phi-2', device_type) +model = og.Model(f'models/microsoft/phi-2') tokenizer = og.Tokenizer(model) @@ -72,88 +74,11 @@ print("Output:") print(text) ``` - -## Build from source - -This step requires `cmake` to be installed. - -1. Clone this repo - - ```bash - git clone https://github.com/microsoft/onnxruntime-genai - cd onnxruntime-genai - ``` - -2. Install ONNX Runtime - - By default, the onnxruntime-genai build expects to find the ONNX Runtime include and binaries in a folder called `ort` in the root directory of onnxruntime-genai. You can put the ONNX Runtime files in a different location and specify this location to the onnxruntime-genai build. These instructions use ORT_HOME as the location. - - * Install from release - - These instructions are for the Linux GPU build of ONNX Runtime. Replace the location with the operating system and target of choice. - - ```bash - cd $ORT_HOME - wget https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-gpu-1.17.1.tgz - tar xvzf onnxruntime-linux-x64-gpu-1.17.1.tgz - mv onnxruntime-linux-x64-gpu-1.17.1/include . - mv onnxruntime-linux-x64-gpu-1.17.1/lib . - ``` - - * Or build from source - - ``` - git clone https://github.com/microsoft/onnxruntime.git - cd onnxruntime - ``` - - Create include and lib folders in the ORT_HOME directory - - ```bash - mkdir $ORT_HOME/include - mkdir $ORT_HOME/lib - ``` - - Build from source and copy the include and libraries into ORT_HOME - - On Windows - - ```cmd - build.bat --config RelWithDebInfo --build_shared_lib --skip_tests --parallel [--use_cuda] - copy include\onnxruntime\core\session\onnxruntime_c_api.h $ORT_HOME\include - copy build\Windows\RelWithDebInfo\RelWithDebInfo\*.dll $ORT_HOME\lib - copy build\Windows\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $ORT_HOME\lib - ``` - - On Linux - - ```cmd - ./build.sh --build_shared_lib --skip_tests --parallel [--use_cuda] - cp include/onnxruntime/core/session/onnxruntime_c_api.h $ORT_HOME/include - cp build/Linux/RelWithDebInfo/libonnxruntime*.so* $ORT_HOME/lib - ``` - -3. Build onnxruntime-genai - - If you are building for CUDA, add the cuda_home argument. - - ```bash - cd .. - python build.py [--cuda_home ] - ``` - -4. 
Install Python wheel - - ```bash - cd build/wheel - pip install *.whl - ``` - ## Model download and export ONNX models are run from a local folder, via a string supplied to the `Model()` method. -To source `microsoft/phi-2` optimized for your target, download and run the following script. You will need to be logged into Hugging Face via the CLI to run the script. +You can bring your own ONNX model or use the model builder utility, included in this package. Install model builder dependencies. @@ -165,14 +90,15 @@ pip install onnx pip install onnxruntime ``` - Export int4 CPU version ```bash huggingface-cli login --token python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o ``` +## Known issues +* Mistral and Gemma support on CUDA only ## Contributing From 9760fbd801a139dd068ff3f88f5ce127f485488f Mon Sep 17 00:00:00 2001 From: aciddelgado <139922440+aciddelgado@users.noreply.github.com> Date: Fri, 15 Mar 2024 23:45:47 -0700 Subject: [PATCH 08/36] swap p and k in sample function (#162) swap p and k to match generate api functions --- src/generators.cpp | 2 +- src/search.cpp | 2 +- src/search.h | 4 ++-- src/search_cuda.cpp | 2 +- src/search_cuda.h | 2 +- test/sampling_benchmark.cpp | 4 ++-- test/sampling_tests.cpp | 39 ++++++++++++++++++++++++------------- 7 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index 640b6559e..b5cd1bb67 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -120,7 +120,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper throw std::runtime_error("top_k must be 0 or greater"); if (top_p > 0.0f && top_k > 1) { - search_->SampleTopPAndK(top_p, top_k, temperature); + search_->SampleTopKTopP(top_k, top_p, temperature); } else if (top_k > 1) { search_->SampleTopK(top_k, temperature); } else { diff --git a/src/search.cpp b/src/search.cpp index b7dd21e46..cc3a6fd10 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -187,7 +187,7 @@ void GreedySearch_Cpu::SampleTopP(float p, float temperature) { AppendNextTokensToSequences(); } -void GreedySearch_Cpu::SampleTopPAndK(float p, int k, float temperature) { +void GreedySearch_Cpu::SampleTopKTopP(int k, float p, float temperature) { std::uniform_real_distribution dis(0, p); for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { diff --git a/src/search.h b/src/search.h index abf373b51..bc81313eb 100644 --- a/src/search.h +++ b/src/search.h @@ -24,7 +24,7 @@ struct Search { virtual void SelectTop() = 0; virtual void SampleTopP(float /*p*/, float /*temperature*/) { assert(false); } virtual void SampleTopK(int /*k*/, float /*temperature*/) { assert(false); } - virtual void SampleTopPAndK(float /*p*/, int /*k*/, float /*temperature*/) { assert(false); } + virtual void SampleTopKTopP(int /*k*/, float /*p*/, float /*temperature*/) { assert(false); } // Scoring features virtual void ApplyMinLength(int min_length) = 0; @@ -69,7 +69,7 @@ struct GreedySearch_Cpu : Search_Cpu { void SelectTop() override; void SampleTopK(int k, float temperature) override; void SampleTopP(float p, float temperature) override; - void SampleTopPAndK(float /*p*/, int /*k*/, float /*temperature*/) override; + void SampleTopKTopP(int /*k*/, float /*p*/, float /*temperature*/) override; private: bool PadIfAlreadyEOS(size_t batch_id); diff --git a/src/search_cuda.cpp b/src/search_cuda.cpp index bc285cad0..304f62cc2 100644 --- a/src/search_cuda.cpp +++ b/src/search_cuda.cpp @@ -154,7 +154,7 @@ void 
GreedySearch_Cuda::SampleTopK(int k, float temperature) { AppendNextTokensToSequences(); } -void GreedySearch_Cuda::SampleTopPAndK(float p, int k, float temperature) { +void GreedySearch_Cuda::SampleTopKTopP(int k, float p, float temperature) { std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), params_.batch_size, k, p, temperature); diff --git a/src/search_cuda.h b/src/search_cuda.h index 50628b9f3..8d1ddbbb4 100644 --- a/src/search_cuda.h +++ b/src/search_cuda.h @@ -51,7 +51,7 @@ struct GreedySearch_Cuda : Search_Cuda { void SelectTop() override; void SampleTopK(int k, float t) override; void SampleTopP(float p, float t) override; - void SampleTopPAndK(float p, int k, float t) override; + void SampleTopKTopP(int k, float p, float t) override; private: void CheckForEOS(); diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index 0fcb138e6..3f21ed669 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -117,7 +117,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->search_->SampleTopKTopP(k, p, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); @@ -252,7 +252,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { cudaStreamSynchronize(params.cuda_stream); auto start = std::chrono::high_resolution_clock::now(); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->search_->SampleTopKTopP(k, p, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); total_time += duration.count(); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 96264f015..f42c03e0a 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -35,8 +35,9 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { auto generator = Generators::CreateGenerator(*model, params); auto logits_span = Generators::cpu_span(logits_cpu); generator->search_->SetLogits(logits_span); + generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->search_->SampleTopP(0.25f, 1.0f); + generator->GenerateNextToken_TopP(0.25f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -60,10 +61,11 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { auto generator = Generators::CreateGenerator(*model, params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); + generator->computed_logits_ = true; // Verify outputs match expected outputs int k = 2; - generator->search_->SampleTopK(k, 1.0); + generator->GenerateNextToken_TopK(k, 1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -91,10 +93,11 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { auto generator = Generators::CreateGenerator(*model, params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); + generator->computed_logits_ = true; // Verify 
outputs match expected outputs float p = 0.25f; int k = 2; - generator->search_->SampleTopPAndK(p, k, 1.0); + generator->GenerateNextToken_TopK_TopP(k, p, 1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -143,7 +146,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); - generator->search_->SampleTopP(0.95f, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopP(0.95f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -178,7 +182,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy=logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); - generator->search_->SampleTopK(k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK(k, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -214,7 +219,8 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -251,8 +257,9 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { cudaStreamSynchronize(params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); + generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->search_->SampleTopP(0.25f, 1.0f); + generator->GenerateNextToken_TopP(0.25f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -278,9 +285,10 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { cudaStreamSynchronize(params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); + generator->computed_logits_ = true; // Verify outputs match expected outputs int k = 2; - generator->search_->SampleTopK(k, 1.0); + generator->GenerateNextToken_TopK(k, 1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -310,10 +318,11 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { cudaStreamSynchronize(params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); + generator->computed_logits_ = true; // Verify outputs match expected outputs float p = 0.25f; int k = 2; - generator->search_->SampleTopPAndK(p, k, 1.0); + generator->GenerateNextToken_TopK_TopP(k, p, 
1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -348,7 +357,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SampleTopP(0.95f, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopP(0.95f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs @@ -387,7 +397,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SampleTopK(k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK(k, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs @@ -427,7 +438,8 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs @@ -465,7 +477,8 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SelectTop(); + generator->computed_logits_ = true; + generator->GenerateNextToken_Top(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs From 21fd88ec351760a63b813ba8b31e2c9c1248131a Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Sun, 17 Mar 2024 19:45:34 -0700 Subject: [PATCH 09/36] Update README.md (#208) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 564ef3370..f49534a6d 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ Users can call a high level `generate()` method, or run each iteration of the mo * Built in logits processing like repetition penalties * Easy custom scoring +See full documentation at [https://onnxruntime.ai/docs/genai]. 
+ ## Features * Supported model architectures: From 13d8be5ea28c77ba91860b0b008b9e0a532bf2cd Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Mon, 18 Mar 2024 10:58:46 -0700 Subject: [PATCH 10/36] Fix install link in READMEs (#213) --- README.md | 2 +- examples/csharp/HelloPhi2.csproj | 2 +- examples/csharp/README.md | 4 ++-- examples/python/README.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f49534a6d..350a2fe67 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ See full documentation at [https://onnxruntime.ai/docs/genai]. ## Sample code for phi-2 in Python -[Install](https://onnxruntime.ai/docs/genai/install) the onnxruntime-genai Python package. +[Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package. ```python import onnxruntime_genai as og diff --git a/examples/csharp/HelloPhi2.csproj b/examples/csharp/HelloPhi2.csproj index 0fb2a1948..a431aa126 100644 --- a/examples/csharp/HelloPhi2.csproj +++ b/examples/csharp/HelloPhi2.csproj @@ -9,7 +9,7 @@ - + diff --git a/examples/csharp/README.md b/examples/csharp/README.md index cb6675e74..edb71a717 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -8,7 +8,7 @@ To generate the model with model builder: 1. Install the python package - Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). + Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). 2. Install the model builder script dependencies @@ -33,6 +33,6 @@ If you bring your own model, you need to provide the configuration. See the [con ## Run the phi-2 model -Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). +Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). Open [HelloPhi2.sln](HelloPhi2.sln) and run the console application. diff --git a/examples/python/README.md b/examples/python/README.md index 2f20cfcd0..cf7fe3450 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -2,7 +2,7 @@ ## Install the onnxruntime-genai library -Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). +Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). 
## Get the model From e6c99669ee0cbde6afd525b9b03dfdc70269c2b3 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:40:17 -0700 Subject: [PATCH 11/36] Simple sequence_length check against max_length (#211) A much better error for when the prompt is greater length than max_length --- src/generators.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/generators.cpp b/src/generators.cpp index b5cd1bb67..d7bf47abe 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -75,6 +75,8 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ throw std::runtime_error("batch_size must be 1 or greater"); if (params.vocab_size < 1) throw std::runtime_error("vocab_size must be 1 or greater"); + if (params.sequence_length >= params.search.max_length) + throw std::runtime_error("input sequence_length is >= max_length"); search_ = CreateSearch(params); state_ = model.CreateState(search_->GetSequenceLengths(), params); From 970fb4d5eb19572f6de9394d898874ed64876d9a Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 19 Mar 2024 14:38:54 -0700 Subject: [PATCH 12/36] Check the return value when calling SetCurrentGpuDeviceId (#219) --- src/models/onnxruntime_inline.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/models/onnxruntime_inline.h b/src/models/onnxruntime_inline.h index 8c955f08b..cd7180e93 100644 --- a/src/models/onnxruntime_inline.h +++ b/src/models/onnxruntime_inline.h @@ -151,14 +151,7 @@ inline std::unique_ptr Allocator::Create(const OrtSession& sess, cons } inline void SetCurrentGpuDeviceId(int device_id) { -#ifdef __APPLE__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-result" -#endif - api->SetCurrentGpuDeviceId(device_id); -#ifdef __APPLE__ -#pragma clang diagnostic pop -#endif + ThrowOnError(api->SetCurrentGpuDeviceId(device_id)); } inline int GetCurrentGpuDeviceId() { From 8679a684e0b0b01200f9462fd9d763f278a1d4aa Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:39:26 -0700 Subject: [PATCH 13/36] Fix python scripts to use options properly (#217) We were using set_search_options, then for example calling 'get_next_token_topk_topp' which is redundant. We can call 'get_next_token()' and it will use the search options we just set. 
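For illustration, a minimal sketch of the intended pattern in Python, based on the calls shown in the diffs below (the model path and the search-option values here are placeholders):

```python
import onnxruntime_genai as og

model = og.Model("path/to/model")   # placeholder path
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
# Set the sampling behaviour once via search options...
params.set_search_options({"do_sample": True, "top_k": 50, "top_p": 0.9,
                           "temperature": 0.6, "max_length": 40})
params.input_ids = tokenizer.encode("This is a test.")

generator = og.Generator(model, params)
while not generator.is_done():
    generator.compute_logits()
    # ...then generate_next_token() picks up top_k/top_p/temperature from those options
    generator.generate_next_token()

print(tokenizer.decode(generator.get_sequence(0)))
```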
Also now bypasses calling TopK_TopP if P is 1.0 when calling generate_next_token --- benchmark/python/benchmark_e2e.py | 21 ++++++++++----------- examples/python/model-chat.py | 4 ++-- src/generators.cpp | 2 +- test/python/test_onnxruntime_genai_api.py | 4 ++-- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py index 4dddded46..f5cfd3143 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -18,12 +18,12 @@ def generate_prompt(model, tokenizer, prompt_length) -> str: prompt = "a" tokens = tokenizer.encode(prompt) params=og.GeneratorParams(model) - params.set_search_options({"max_length":prompt_length, "min_length":prompt_length+1}) + params.set_search_options({"do_sample":True, "top_k":5, "temperature":temperature, "max_length":prompt_length, "min_length":prompt_length+1}) params.input_ids = tokens generator=og.Generator(model, params) while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top_k(5, temperature) + generator.generate_next_token() return tokenizer.decode(generator.get_sequence(0)) def save_results(results, filename): @@ -65,15 +65,17 @@ def main(args): # Generate prompt prompt = [generate_prompt(model, tokenizer, prompt_length)] * batch_size tokens = tokenizer.encode_batch(prompt) + + params = og.GeneratorParams(model) + params.input_ids = tokens + params.set_search_options({"do_sample":True, "top_k":args.top_k, "top_p":args.top_p, "temperature":temperature, "max_length":max_length, "min_length":max_length}) + if args.verbose: print("Running warmup runs...") for _ in tqdm(range(args.warmup)): - params = og.GeneratorParams(model) - params.input_ids = tokens - params.set_search_options({"max_length":max_length, "min_length":max_length}) generator = og.Generator(model, params) while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, temperature) + generator.generate_next_token() if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0))) tokenize_times = [] @@ -84,9 +86,6 @@ def main(args): if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}") for _ in tqdm(range(num_repetitions)): # Prepare run - params = og.GeneratorParams(model) - params.input_ids = tokens - params.set_search_options({"max_length":max_length, "min_length":max_length}) generator = og.Generator(model, params) # Measure tokenization @@ -102,7 +101,7 @@ def main(args): prompt_times.append(prompt_end_time - prompt_start_time) sampling_start_time = time.perf_counter() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, temperature) + generator.generate_next_token() sampling_end_time = time.perf_counter() sampling_times.append(sampling_end_time - sampling_start_time) @@ -115,7 +114,7 @@ def main(args): token_gen_end_time = time.perf_counter() sampling_start_time = time.perf_counter() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, temperature) + generator.generate_next_token() sampling_end_time = time.perf_counter() token_gen_times.append(token_gen_end_time - token_gen_start_time) diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py index edbfc1fc0..559423989 100644 --- a/examples/python/model-chat.py +++ b/examples/python/model-chat.py @@ -20,7 +20,7 @@ def main(args): input_tokens = tokenizer.encode(text) params = og.GeneratorParams(model) - 
params.set_search_options({"max_length": args.max_length, "top_p": args.top_p, "top_k": args.top_k, "temperature": args.temperature, "repetition_penalty": args.repetition_penalty}) + params.set_search_options({"do_sample": True, "max_length": args.max_length, "top_p": args.top_p, "top_k": args.top_k, "temperature": args.temperature, "repetition_penalty": args.repetition_penalty}) params.input_ids = input_tokens generator = og.Generator(model, params) if args.verbose: print("Generator created") @@ -29,7 +29,7 @@ def main(args): print(f'\n{text}', end='') while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, args.temperature) + generator.generate_next_token() print(tokenizer_stream.decode(generator.get_next_tokens()[0]), end='', flush=True) print() diff --git a/src/generators.cpp b/src/generators.cpp index d7bf47abe..d64ad34a3 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -121,7 +121,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper if (top_k < 0) throw std::runtime_error("top_k must be 0 or greater"); - if (top_p > 0.0f && top_k > 1) { + if (top_p > 0.0f && top_p < 1.0f && top_k > 1) { search_->SampleTopKTopP(top_k, top_p, temperature); } else if (top_k > 1) { search_->SampleTopK(top_k, temperature); diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py index 7be2c68aa..695fb8802 100644 --- a/test/python/test_onnxruntime_genai_api.py +++ b/test/python/test_onnxruntime_genai_api.py @@ -32,14 +32,14 @@ def test_greedy_search(test_data_path, relative_model_path): search_params.input_ids = np.array( [[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32 ) - search_params.set_search_options({"max_length": 10}) + search_params.set_search_options({"do_sample": False, "max_length": 10}) input_ids_shape = [2, 4] batch_size = input_ids_shape[0] generator = og.Generator(model, search_params) while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top() + generator.generate_next_token() expected_sequence = np.array( [ From 7a33c01468d2cf03d89e037ed6cca7a6e9ca061d Mon Sep 17 00:00:00 2001 From: Adam Clark Date: Wed, 20 Mar 2024 11:10:36 +1300 Subject: [PATCH 14/36] Access to sampling methods in C# Api (#206) --- src/csharp/Generator.cs | 21 ++- src/csharp/NativeMethods.cs | 23 ++++ src/ort_genai_c.cpp | 28 ++++ src/ort_genai_c.h | 8 ++ test/c_api_tests.cpp | 179 +++++++++++++++++++++++++ test/csharp/TestOnnxRuntimeGenAIAPI.cs | 156 +++++++++++++++++++++ 6 files changed, 414 insertions(+), 1 deletion(-) diff --git a/src/csharp/Generator.cs b/src/csharp/Generator.cs index 1dc81883b..10c3d4e47 100644 --- a/src/csharp/Generator.cs +++ b/src/csharp/Generator.cs @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
using System; -using System.Runtime.InteropServices; namespace Microsoft.ML.OnnxRuntimeGenAI { @@ -26,11 +25,31 @@ public void ComputeLogits() Result.VerifySuccess(NativeMethods.OgaGenerator_ComputeLogits(_generatorHandle)); } + public void GenerateNextToken() + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken(_generatorHandle)); + } + public void GenerateNextTokenTop() { Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_Top(_generatorHandle)); } + public void GenerateNextTokenTopK(int k, float temperature) + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK(_generatorHandle, k, temperature)); + } + + public void GenerateNextTokenTopP(float p, float temperature) + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopP(_generatorHandle, p, temperature)); + } + + public void GenerateNextTokenTopKTopP(int k, float p, float temperature) + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK_TopP(_generatorHandle, k, p, temperature)); + } + public ReadOnlySpan GetSequence(ulong index) { ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceLength(_generatorHandle, (UIntPtr)index).ToUInt64(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 4b41102d7..039dfb4de 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -76,10 +76,33 @@ internal class NativeLib [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGenerator_ComputeLogits(IntPtr /* OgaGenerator* */ generator); + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken(IntPtr /* OgaGenerator* */ generator); + // This function is used to generate the next token in the sequence using the greedy search algorithm. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_Top(IntPtr /* OgaGenerator* */ generator); + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK(IntPtr /* OgaGenerator* */ generator, + int /* int32_t */ k, + float /* single_t */ t); + + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopP(IntPtr /* OgaGenerator* */ generator, + float /* single_t */ p, + float /* single_t */ t); + + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK_TopP(IntPtr /* OgaGenerator* */ generator, + int /* int32_t */ k, + float /* single_t */ p, + float /* single_t */ t); + // This function returns the length of the sequence at the given index. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceLength(IntPtr /* const OgaGenerator* */ generator, diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 25b0edf18..fbb986af0 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -145,6 +145,13 @@ OgaResult* OGA_API_CALL OgaGenerator_ComputeLogits(OgaGenerator* generator) { OGA_CATCH } +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken(); + return nullptr; + OGA_CATCH +} + OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator) { OGA_TRY reinterpret_cast(generator)->GenerateNextToken_Top(); @@ -152,6 +159,27 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generat OGA_CATCH } +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken_TopK(k, t); + return nullptr; + OGA_CATCH +} + +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken_TopP(p, t); + return nullptr; + OGA_CATCH +} + +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken_TopK_TopP(k, p, t); + return nullptr; + OGA_CATCH +} + size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().size(); diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 255bfbafb..e702082fc 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -180,9 +180,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_ComputeLogits(OgaGenerator* gene */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator); +/* Top-K sampling: most probable words from the model's output probability distribution for the next word + */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t); + +/*Top-P sampling selects words from the smallest set of words whose cumulative probability exceeds a predefined threshold (p) + */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator); + /* * \brief Returns the number of tokens in the sequence at the given index. * \param[in] generator The generator to get the count of the tokens for the sequence at the given index. 
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index ab5bfc169..2ac6bfb71 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -221,3 +221,182 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); } } + +#if TEST_PHI2 +TEST(CAPITests, TopKCAPI) { + float top_k = 50; + float temp = 0.6f; + + OgaModel* model; + CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); + OgaModelPtr model_ptr{model}; + + OgaTokenizer* tokenizer; + CheckResult(OgaCreateTokenizer(model, &tokenizer)); + OgaTokenizerPtr tokenizer_ptr{tokenizer}; + + OgaSequences* input_sequences; + CheckResult(OgaCreateSequences(&input_sequences)); + OgaSequencesPtr sequences_ptr{input_sequences}; + + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; + + for (auto& string : input_strings) + CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + + OgaGeneratorParams* params; + CheckResult(OgaCreateGeneratorParams(model, ¶ms)); + OgaGeneratorParamsPtr params_ptr{params}; + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + + OgaGenerator* generator; + CheckResult(OgaCreateGenerator(model, params, &generator)); + OgaGeneratorPtr generator_ptr{generator}; + + while (!OgaGenerator_IsDone(generator)) { + CheckResult(OgaGenerator_ComputeLogits(generator)); + CheckResult(OgaGenerator_GenerateNextToken_TopK(generator, top_k, temp)); + } + + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + OgaSequences* output_sequences; + CheckResult(OgaGenerate(model, params, &output_sequences)); + OgaSequencesPtr output_sequences_ptr{output_sequences}; + + // Decode The Batch + for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { + std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + + const char* out_string; + CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + std::cout << "Decoded string:" << out_string << std::endl; + OgaDestroyString(out_string); + } +} + +TEST(CAPITests, TopPCAPI) { + float top_p = 0.6f; + float temp = 0.6f; + + OgaModel* model; + CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); + OgaModelPtr model_ptr{model}; + + OgaTokenizer* tokenizer; + CheckResult(OgaCreateTokenizer(model, &tokenizer)); + OgaTokenizerPtr tokenizer_ptr{tokenizer}; + + OgaSequences* input_sequences; + CheckResult(OgaCreateSequences(&input_sequences)); + OgaSequencesPtr sequences_ptr{input_sequences}; + + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; + + for (auto& string : input_strings) + CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + + OgaGeneratorParams* params; + CheckResult(OgaCreateGeneratorParams(model, ¶ms)); + OgaGeneratorParamsPtr params_ptr{params}; + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + + OgaGenerator* generator; + CheckResult(OgaCreateGenerator(model, params, &generator)); + OgaGeneratorPtr 
generator_ptr{generator}; + + while (!OgaGenerator_IsDone(generator)) { + CheckResult(OgaGenerator_ComputeLogits(generator)); + CheckResult(OgaGenerator_GenerateNextToken_TopP(generator, top_p, temp)); + } + + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + OgaSequences* output_sequences; + CheckResult(OgaGenerate(model, params, &output_sequences)); + OgaSequencesPtr output_sequences_ptr{output_sequences}; + + // Decode The Batch + for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { + std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + + const char* out_string; + CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + std::cout << "Decoded string:" << out_string << std::endl; + OgaDestroyString(out_string); + } +} + +TEST(CAPITests, TopKTopPCAPI) { + float top_p = 0.6f; + int top_k = 50; + float temp = 0.6f; + + OgaModel* model; + CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); + OgaModelPtr model_ptr{model}; + + OgaTokenizer* tokenizer; + CheckResult(OgaCreateTokenizer(model, &tokenizer)); + OgaTokenizerPtr tokenizer_ptr{tokenizer}; + + OgaSequences* input_sequences; + CheckResult(OgaCreateSequences(&input_sequences)); + OgaSequencesPtr sequences_ptr{input_sequences}; + + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; + + for (auto& string : input_strings) + CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + + OgaGeneratorParams* params; + CheckResult(OgaCreateGeneratorParams(model, ¶ms)); + OgaGeneratorParamsPtr params_ptr{params}; + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + + OgaGenerator* generator; + CheckResult(OgaCreateGenerator(model, params, &generator)); + OgaGeneratorPtr generator_ptr{generator}; + + while (!OgaGenerator_IsDone(generator)) { + CheckResult(OgaGenerator_ComputeLogits(generator)); + CheckResult(OgaGenerator_GenerateNextToken_TopK_TopP(generator, top_k, top_p, temp)); + } + + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + OgaSequences* output_sequences; + CheckResult(OgaGenerate(model, params, &output_sequences)); + OgaSequencesPtr output_sequences_ptr{output_sequences}; + + // Decode The Batch + for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { + std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + + const char* out_string; + CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + std::cout << "Decoded string:" << out_string << std::endl; + OgaDestroyString(out_string); + } +} + +#endif // TEST_PHI2 diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 2121fc7aa..7bca5ffdc 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -8,6 +8,7 @@ using Microsoft.ML.OnnxRuntimeGenAI; using 
System.Collections.Generic; using System.Linq; +using System.Reflection.Emit; namespace Microsoft.ML.OnnxRuntimeGenAI.Tests { @@ -86,6 +87,161 @@ public void TestGreedySearch() } } + [IgnoreOnModelAbsebceFact(DisplayName = "TestTopKSearch")] + public void TestTopKSearch() + { + int topK = 100; + float temp = 0.6f; + ulong maxLength = 40; + + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); + using (var model = new Model(modelPath)) + { + Assert.NotNull(model); + using (var tokenizer = new Tokenizer(model)) + { + Assert.NotNull(tokenizer); + + var strings = new string[] { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog." + }; + + var sequences = tokenizer.EncodeBatch(strings); + Assert.NotNull(sequences); + Assert.Equal((ulong)strings.Length, sequences.NumSequences); + + using GeneratorParams generatorParams = new GeneratorParams(model); + Assert.NotNull(generatorParams); + + generatorParams.SetSearchOption("max_length", 20); + generatorParams.SetInputSequences(sequences); + + using Generator generator = new Generator(model, generatorParams); + Assert.NotNull(generator); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTopK(topK, temp); + } + + generatorParams.SetSearchOption("do_sample", true); + generatorParams.SetSearchOption("top_k", topK); + generatorParams.SetSearchOption("temperature", temp); + var outputSequences = model.Generate(generatorParams); + Assert.NotNull(outputSequences); + + var outputStrings = tokenizer.DecodeBatch(outputSequences); + Assert.NotNull(outputStrings); + } + } + } + + [IgnoreOnModelAbsebceFact(DisplayName = "TestTopPSearch")] + public void TestTopPSearch() + { + float topP = 0.6f; + float temp = 0.6f; + ulong maxLength = 40; + + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); + using (var model = new Model(modelPath)) + { + Assert.NotNull(model); + using (var tokenizer = new Tokenizer(model)) + { + Assert.NotNull(tokenizer); + + var strings = new string[] { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog." 
+ }; + + var sequences = tokenizer.EncodeBatch(strings); + Assert.NotNull(sequences); + Assert.Equal((ulong)strings.Length, sequences.NumSequences); + + using GeneratorParams generatorParams = new GeneratorParams(model); + Assert.NotNull(generatorParams); + + generatorParams.SetSearchOption("max_length", 20); + generatorParams.SetInputSequences(sequences); + + using Generator generator = new Generator(model, generatorParams); + Assert.NotNull(generator); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTopP(topP, temp); + } + + generatorParams.SetSearchOption("do_sample", true); + generatorParams.SetSearchOption("top_p", topP); + generatorParams.SetSearchOption("temperature", temp); + var outputSequences = model.Generate(generatorParams); + Assert.NotNull(outputSequences); + + var outputStrings = tokenizer.DecodeBatch(outputSequences); + Assert.NotNull(outputStrings); + } + } + } + + [IgnoreOnModelAbsebceFact(DisplayName = "TestTopKTopPSearch")] + public void TestTopKTopPSearch() + { + int topK = 100; + float topP = 0.6f; + float temp = 0.6f; + ulong maxLength = 40; + + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); + using (var model = new Model(modelPath)) + { + Assert.NotNull(model); + using (var tokenizer = new Tokenizer(model)) + { + Assert.NotNull(tokenizer); + + var strings = new string[] { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog." + }; + + var sequences = tokenizer.EncodeBatch(strings); + Assert.NotNull(sequences); + Assert.Equal((ulong)strings.Length, sequences.NumSequences); + + using GeneratorParams generatorParams = new GeneratorParams(model); + Assert.NotNull(generatorParams); + + generatorParams.SetSearchOption("max_length", 20); + generatorParams.SetInputSequences(sequences); + + using Generator generator = new Generator(model, generatorParams); + Assert.NotNull(generator); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTopKTopP(topK, topP, temp); + } + + generatorParams.SetSearchOption("do_sample", true); + generatorParams.SetSearchOption("top_k", topK); + generatorParams.SetSearchOption("top_p", topP); + generatorParams.SetSearchOption("temperature", temp); + var outputSequences = model.Generate(generatorParams); + Assert.NotNull(outputSequences); + + var outputStrings = tokenizer.DecodeBatch(outputSequences); + Assert.NotNull(outputStrings); + } + } + } + [IgnoreOnModelAbsebceFact(DisplayName = "TestTokenizerBatchEncodeDecode")] public void TestTokenizerBatchEncodeDecode() { From 3a9ecf787f8bc6e84a0850f37044e909bf49d06d Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:08:10 -0700 Subject: [PATCH 15/36] Make Model and GeneratorParams be a shared_ptr vs unique_ptr (#212) For safety. This will ensure the Model object's lifetime matches that of any Generator using it. 
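As a rough standalone sketch of the lifetime guarantee this gives (simplified stand-in types, not the actual classes in this repo):

```cpp
#include <cassert>
#include <memory>

struct Model : std::enable_shared_from_this<Model> {
  int id{42};
};

struct Generator {
  // Taking a shared_ptr via shared_from_this keeps the Model alive for as
  // long as any Generator that uses it exists.
  explicit Generator(const Model& model) : model_{model.shared_from_this()} {}
  std::shared_ptr<const Model> model_;
};

int main() {
  std::unique_ptr<Generator> generator;
  {
    auto model = std::make_shared<Model>();
    generator = std::make_unique<Generator>(*model);
  }  // the local shared_ptr goes away here, but the Generator still owns the Model
  assert(generator->model_->id == 42);  // safe: the Model outlives the local handle
}
```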
Same as GeneratorParams and Tokenizer --- src/generators.cpp | 8 +- src/generators.h | 10 +- src/models/input_ids.cpp | 10 +- src/models/kv_cache.cpp | 16 +-- src/models/logits.cpp | 12 +- src/models/model.cpp | 27 ++-- src/models/model.h | 13 +- src/models/position_ids.cpp | 18 +-- src/models/whisper.cpp | 4 +- src/ort_genai_c.cpp | 18 ++- src/python/python.cpp | 50 +++---- src/search.cpp | 68 +++++----- src/search.h | 6 +- src/search_cuda.cpp | 84 ++++++------ src/search_cuda.h | 2 +- test/model_tests.cpp | 118 ++++++++--------- test/sampling_benchmark.cpp | 170 ++++++++++++------------ test/sampling_tests.cpp | 252 ++++++++++++++++++------------------ 18 files changed, 454 insertions(+), 432 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index d64ad34a3..6844a9aaf 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -66,7 +66,7 @@ std::unique_ptr CreateSearch(const GeneratorParams& params) { return std::make_unique(params); } -Generator::Generator(const Model& model, const GeneratorParams& params) : model_{model} { +Generator::Generator(const Model& model, const GeneratorParams& params) : model_{model.shared_from_this()} { if (params.search.max_length == 0) throw std::runtime_error("search max_length is 0"); if (params.search.max_length > model.config_->model.context_length) @@ -89,7 +89,7 @@ void Generator::ComputeLogits() { search_->SetLogits(state_->Run(search_->GetSequenceLength(), search_->GetNextTokens(), search_->GetNextIndices())); computed_logits_ = true; - auto& search = search_->params_.search; + auto& search = search_->params_->search; search_->ApplyMinLength(search.min_length); search_->ApplyRepetitionPenalty(search.repetition_penalty); } @@ -112,7 +112,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper } // The user explicitly called TopK_TopP on a beam search - if (search_->params_.search.num_beams != 1) + if (search_->params_->search.num_beams != 1) throw std::runtime_error("TopK and TopP cannot be used with a beam search"); // Sanity checks @@ -134,7 +134,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper } void Generator::GenerateNextToken() { - auto& search = search_->params_.search; + auto& search = search_->params_->search; if (search.do_sample) GenerateNextToken_TopK_TopP(search.top_k, search.top_p, search.temperature); else diff --git a/src/generators.h b/src/generators.h index 433fda103..1b42b45e9 100644 --- a/src/generators.h +++ b/src/generators.h @@ -44,7 +44,7 @@ enum struct DeviceType { CUDA, }; -struct GeneratorParams { +struct GeneratorParams : std::enable_shared_from_this { GeneratorParams() = default; // This constructor is only used if doing a custom model handler vs built-in GeneratorParams(const Model& model); @@ -91,6 +91,8 @@ struct GeneratorParams { std::variant inputs; std::vector input_ids_owner; // Backing memory of input_ids in some cases + + std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime }; struct Generator { @@ -106,13 +108,15 @@ struct Generator { RoamingArray GetSequence(int index) const; - const Model& model_; + std::shared_ptr model_; std::unique_ptr state_; std::unique_ptr search_; bool computed_logits_{}; // Set to true in ComputeLogits() and false after appending a token to ensure a 1 to 1 call ratio }; -std::unique_ptr CreateModel(OrtEnv& ort_env, const char* config_path); +std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path); +std::shared_ptr CreateGeneratorParams(const 
Model& model); +std::shared_ptr CreateGeneratorParams(); // For benchmarking purposes only std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params); std::vector> Generate(const Model& model, const GeneratorParams& params); // Uses CreateGenerator and a simple loop to return the entire sequence diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 96a8facc8..88d2514b5 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -9,24 +9,24 @@ InputIDs::InputIDs(const Model& model, State& state) : model_{model}, state_{state} { name_ = model_.config_->model.decoder.inputs.input_ids.c_str(); - shape_ = {state_.params_.batch_size, state_.params_.sequence_length}; + shape_ = {state_.params_->batch_size, state_.params_->sequence_length}; type_ = model_.session_info_->GetInputDataType(name_); // If 64-bit, convert from 32-bit to 64-bit if (type_ == Ort::TypeToTensorType::type) { value_ = OrtValue::CreateTensor(model.allocator_cpu_, shape_, type_); auto* p_data = value_->GetTensorMutableData(); - for (auto v : state_.params_.input_ids) { + for (auto v : state_.params_->input_ids) { *p_data++ = v; } } else { if (type_ != Ort::TypeToTensorType::type) throw std::runtime_error("InputIDs must be int64 or int32"); - value_ = OrtValue::CreateTensor(model.allocator_cpu_.GetInfo(), std::span(const_cast(state_.params_.input_ids.data()), shape_[0] * shape_[1]), shape_); + value_ = OrtValue::CreateTensor(model.allocator_cpu_.GetInfo(), std::span(const_cast(state_.params_->input_ids.data()), shape_[0] * shape_[1]), shape_); } - value_ = model_.ExpandInputs(value_, state_.params_.search.num_beams); - shape_[0] *= state_.params_.search.num_beams; + value_ = model_.ExpandInputs(value_, state_.params_->search.num_beams); + shape_[0] *= state_.params_->search.num_beams; } void InputIDs::Add() { diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 4a0910066..17515355f 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -8,7 +8,7 @@ KV_Cache_Combined::KV_Cache_Combined(const Model& model, State& state) : model_{model}, state_{state}, layer_count_{model.config_->model.decoder.num_hidden_layers}, - shape_{2, state_.params_.BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { + shape_{2, state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { pasts_.resize(layer_count_); presents_.reserve(layer_count_); @@ -25,7 +25,7 @@ KV_Cache_Combined::KV_Cache_Combined(const Model& model, State& state) type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]); empty_past_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); - shape_[3] = state_.params_.sequence_length; + shape_[3] = state_.params_->sequence_length; for (int i = 0; i < layer_count_; ++i) { presents_.push_back(OrtValue::CreateTensor(*model.allocator_device_, shape_, type_)); @@ -45,7 +45,7 @@ void KV_Cache_Combined::Add() { } void KV_Cache_Combined::Update(std::span beam_indices, int current_length) { - assert(state_.params_.search.num_beams == 1 || !beam_indices.empty()); // We require beam_indices if we're a beam search + assert(state_.params_->search.num_beams == 1 || !beam_indices.empty()); // We require beam_indices if we're a beam search for (int i = 0; i < layer_count_; i++) { if (beam_indices.empty()) { @@ -117,8 +117,8 @@ KV_Cache::KV_Cache(const Model& model, State& state) : model_{model}, state_{state}, 
layer_count_{model_.config_->model.decoder.num_hidden_layers}, - past_present_share_buffer_{state_.params_.search.past_present_share_buffer && state_.params_.search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA}, - shape_{state_.params_.BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { + past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA}, + shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { pasts_.resize(layer_count_ * 2); presents_.reserve(layer_count_ * 2); @@ -142,9 +142,9 @@ KV_Cache::KV_Cache(const Model& model, State& state) // Set the size after empty_past_ has been created with 0 for this field if (past_present_share_buffer_) - shape_[2] = state_.params_.search.max_length; + shape_[2] = state_.params_->search.max_length; else - shape_[2] = state_.params_.sequence_length; + shape_[2] = state_.params_->sequence_length; for (int i = 0; i < layer_count_; ++i) { presents_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_)); @@ -245,7 +245,7 @@ Cross_Cache::Cross_Cache(const Model& model, State& state) : model_{model}, state_{state}, layer_count_{model_.config_->model.decoder.num_hidden_layers}, - shape_{state_.params_.BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 1500, model.config_->model.decoder.head_size} { + shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 1500, model.config_->model.decoder.head_size} { values_.reserve(layer_count_ * 2); for (int i = 0; i < layer_count_; ++i) { diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 7dc79e53a..d7dd837f3 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -7,7 +7,7 @@ namespace Generators { Logits::Logits(const Model& model, State& state) : model_{model}, state_{state}, - shape_{static_cast(state_.params_.batch_size) * state_.params_.search.num_beams, state_.params_.sequence_length, state_.params_.vocab_size}, + shape_{static_cast(state_.params_->batch_size) * state_.params_->search.num_beams, state_.params_->sequence_length, state_.params_->vocab_size}, type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} { if (model_.device_type_ == DeviceType::CPU && type_ != Ort::TypeToTensorType::type) throw std::runtime_error("Model logits_type can only be float32 on CPU"); @@ -34,7 +34,7 @@ RoamingArray Logits::Get() { if (shape_[1] != 1) { const size_t seq_length = shape_[1]; const size_t vocab_size = shape_[2]; - const size_t num_beams = state_.params_.search.num_beams; + const size_t num_beams = state_.params_->search.num_beams; shape_[1] = 1; auto value_next = OrtValue::CreateTensor(*model_.allocator_device_, shape_); @@ -42,12 +42,12 @@ RoamingArray Logits::Get() { size_t vocab_index = 0; // Simpler math to have this index go up by vocab_size for every logit chunk we process - const auto* input_ids = state_.params_.input_ids.data(); - for (int batch_index = 0; batch_index < state_.params_.batch_size; batch_index++) { + const auto* input_ids = state_.params_->input_ids.data(); + for (int batch_index = 0; batch_index < state_.params_->batch_size; batch_index++) { // Find the first non pad token from the end size_t token_index = seq_length; while (token_index-- > 0) { - if (input_ids[token_index] != state_.params_.pad_token_id) + if 
(input_ids[token_index] != state_.params_->pad_token_id) break; } @@ -57,7 +57,7 @@ RoamingArray Logits::Get() { auto target = logits_next.subspan(vocab_index, vocab_size); #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) - CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_.cuda_stream); + CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_->cuda_stream); else #endif copy(source, target); diff --git a/src/models/model.cpp b/src/models/model.cpp index 993b66248..a31b1ed84 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -12,7 +12,7 @@ namespace Generators { -State::State(const GeneratorParams& params) : params_{params} { +State::State(const GeneratorParams& params) : params_{params.shared_from_this()} { } void State::Run(OrtSession& session) { @@ -94,13 +94,13 @@ void CheckResult(tfmError_t error) { } TokenizerStream::TokenizerStream(const Tokenizer& tokenizer) - : tokenizer_{tokenizer} { + : tokenizer_{tokenizer.shared_from_this()} { CheckResult(TfmCreate(kTfmKindDetokenizerCache, cache_.Address())); } const std::string& TokenizerStream::Decode(int32_t token) { const char* string; - CheckResult(TfmDetokenizeCached(tokenizer_.tokenizer_, cache_, token, &string)); + CheckResult(TfmDetokenizeCached(tokenizer_->tokenizer_, cache_, token, &string)); chunk_ = string; return chunk_; } @@ -297,23 +297,32 @@ void Model::CreateSessionOptions() { } } -std::unique_ptr Model::CreateTokenizer() const { - return std::make_unique(*config_); +std::shared_ptr Model::CreateTokenizer() const { + return std::make_shared(*config_); } -std::unique_ptr CreateModel(OrtEnv& ort_env, const char* config_path) { +std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path) { auto config = std::make_unique(config_path); if (config->model.type == "gpt2") - return std::make_unique(std::move(config), ort_env); + return std::make_shared(std::move(config), ort_env); if (config->model.type == "llama" || config->model.type == "gemma" || config->model.type == "mistral" || config->model.type == "phi") - return std::make_unique(std::move(config), ort_env); + return std::make_shared(std::move(config), ort_env); if (config->model.type == "whisper") - return std::make_unique(std::move(config), ort_env); + return std::make_shared(std::move(config), ort_env); throw std::runtime_error("Unsupported model_type in config.json: " + config->model.type); } +std::shared_ptr CreateGeneratorParams(const Model& model) { + return std::make_shared(model); +} + +// Used by benchmarking tests only, should not be used normally +std::shared_ptr CreateGeneratorParams() { + return std::make_shared(); +} + #if USE_CUDA void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr& p_out) { auto shape_info = in.GetTensorTypeAndShapeInfo(); diff --git a/src/models/model.h b/src/models/model.h index 3f1d4ceca..9af784362 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -15,7 +15,7 @@ struct State { virtual RoamingArray Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices = {}) = 0; - const GeneratorParams& params_; + std::shared_ptr params_; std::vector input_names_, output_names_; std::vector inputs_, outputs_; @@ -57,7 +57,7 @@ struct TokenizerStream { const std::string& Decode(int32_t token); private: - const Tokenizer& tokenizer_; + std::shared_ptr tokenizer_; TfmPtr cache_; std::string chunk_; }; @@ -66,7 
+66,7 @@ struct TokenizerStream { // Sequence length is vector.size()/count std::vector PadInputs(std::span > sequences, int32_t pad_token_id); -struct Tokenizer { +struct Tokenizer : std::enable_shared_from_this { Tokenizer(Config& config); std::unique_ptr CreateStream() const; @@ -78,6 +78,7 @@ struct Tokenizer { std::vector DecodeBatch(std::span sequences, size_t count) const; TfmPtr tokenizer_; + std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime private: int32_t pad_token_id_; @@ -94,11 +95,11 @@ struct SessionInfo { std::unordered_map inputs_, outputs_; }; -struct Model { +struct Model : std::enable_shared_from_this { Model(std::unique_ptr config); virtual ~Model(); - std::unique_ptr CreateTokenizer() const; + std::shared_ptr CreateTokenizer() const; virtual std::unique_ptr CreateState(RoamingArray sequence_lengths, const GeneratorParams& params) const = 0; @@ -113,6 +114,8 @@ struct Model { std::unique_ptr session_info_; + std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + protected: void InitDeviceAllocator(OrtSession& session); void CreateSessionOptions(); diff --git a/src/models/position_ids.cpp b/src/models/position_ids.cpp index bfff6e161..a0e8d6b56 100644 --- a/src/models/position_ids.cpp +++ b/src/models/position_ids.cpp @@ -12,7 +12,7 @@ PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray if (type_ != Ort::TypeToTensorType::type && type_ != Ort::TypeToTensorType::type) throw std::runtime_error("position_ids & attention_mask only support int32 or int64 types"); - std::array shape{state_.params_.batch_size, state_.params_.sequence_length}; // Only batch_size initially, as we haven't expanded over the beams yet + std::array shape{state_.params_->batch_size, state_.params_->sequence_length}; // Only batch_size initially, as we haven't expanded over the beams yet position_ids_ = OrtValue::CreateTensor(model.allocator_cpu_, shape, type_); position_ids_next_ = OrtValue::CreateTensor(model.allocator_cpu_, std::array{shape[0], 1}, type_); attention_mask_ = OrtValue::CreateTensor(model.allocator_cpu_, shape, type_); @@ -22,10 +22,10 @@ PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray else InitializeTensors(shape, sequence_lengths_unk); - position_ids_ = model_.ExpandInputs(position_ids_, state_.params_.search.num_beams); - position_ids_next_ = model_.ExpandInputs(position_ids_next_, state_.params_.search.num_beams); - attention_mask_ = model_.ExpandInputs(attention_mask_, state_.params_.search.num_beams); - shape[0] *= state_.params_.search.num_beams; + position_ids_ = model_.ExpandInputs(position_ids_, state_.params_->search.num_beams); + position_ids_next_ = model_.ExpandInputs(position_ids_next_, state_.params_->search.num_beams); + attention_mask_ = model_.ExpandInputs(attention_mask_, state_.params_->search.num_beams); + shape[0] *= state_.params_->search.num_beams; position_ids_shape_ = shape; attention_mask_shape_ = shape; } @@ -106,13 +106,13 @@ void PositionIDs::InitializeTensors(std::array shape, cpu_spanGetTensorMutableData(); auto* position_data = position_ids_->GetTensorMutableData(); auto* position_data_next = position_ids_next_->GetTensorMutableData(); - const auto* word_id = state_.params_.input_ids.data(); + const auto* word_id = state_.params_->input_ids.data(); auto* mask = mask_data; auto* position = position_data; for (int i = 0; i < shape[0]; i++) { T abs_position = 0; for (int j = 0; j < shape[1]; j++, word_id++, mask++, 
position++) { - if (*word_id == state_.params_.pad_token_id) { + if (*word_id == state_.params_->pad_token_id) { *mask = 0; *position = 0; } else { @@ -122,8 +122,8 @@ void PositionIDs::InitializeTensors(std::array shape, cpu_span(abs_position); + for (int k = 0; k < state_.params_->search.num_beams; k++) { + sequence_lengths[i * state_.params_->search.num_beams + k] = static_cast(abs_position); } } } diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index f6f2aaea1..5c8fe9d83 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -20,12 +20,12 @@ Whisper_State::Whisper_State(const Whisper_Model& model, RoamingArray s model_{model} { auto& inputs = const_cast(std::get(params.inputs)); - auto encoder_input_ids = model_.ExpandInputs(inputs.input_features, params_.search.num_beams); + auto encoder_input_ids = model_.ExpandInputs(inputs.input_features, params_->search.num_beams); encoder_hidden_states_ = OrtValue::CreateTensor(*model_.allocator_device_, std::array{decoder_input_ids_.GetShape()[0], 1500, 384}); auto sequence_lengths = sequence_lengths_unk.GetCPU(); for (int i = 0; i < decoder_input_ids_.GetShape()[0]; i++) { - sequence_lengths[i] = static_cast(params_.sequence_length); + sequence_lengths[i] = static_cast(params_->sequence_length); } input_names_.push_back("encoder_input_ids"); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index fbb986af0..bbf84be51 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -63,14 +63,18 @@ const int32_t* OGA_API_CALL OgaSequencesGetSequenceData(const OgaSequences* p, s OgaResult* OGA_API_CALL OgaCreateModel(const char* config_path, OgaModel** out) { OGA_TRY - *out = reinterpret_cast(Generators::CreateModel(Generators::GetOrtEnv(), config_path).release()); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), config_path); + model->external_owner_ = model; + *out = reinterpret_cast(model.get()); return nullptr; OGA_CATCH } OgaResult* OGA_API_CALL OgaCreateGeneratorParams(const OgaModel* model, OgaGeneratorParams** out) { OGA_TRY - *out = reinterpret_cast(new Generators::GeneratorParams(*reinterpret_cast(model))); + auto params = std::make_shared(*reinterpret_cast(model)); + params->external_owner_ = params; + *out = reinterpret_cast(params.get()); return nullptr; OGA_CATCH } @@ -192,7 +196,9 @@ const int32_t* OGA_API_CALL OgaGenerator_GetSequence(const OgaGenerator* oga_gen OgaResult* OGA_API_CALL OgaCreateTokenizer(const OgaModel* model, OgaTokenizer** out) { OGA_TRY - *out = reinterpret_cast(reinterpret_cast(model)->CreateTokenizer().release()); + auto tokenizer = reinterpret_cast(model)->CreateTokenizer(); + tokenizer->external_owner_ = tokenizer; + *out = reinterpret_cast(tokenizer.get()); return nullptr; OGA_CATCH } @@ -265,11 +271,11 @@ void OGA_API_CALL OgaDestroySequences(OgaSequences* p) { } void OGA_API_CALL OgaDestroyModel(OgaModel* p) { - delete reinterpret_cast(p); + reinterpret_cast(p)->external_owner_ = nullptr; } void OGA_API_CALL OgaDestroyGeneratorParams(OgaGeneratorParams* p) { - delete reinterpret_cast(p); + reinterpret_cast(p)->external_owner_ = nullptr; } void OGA_API_CALL OgaDestroyGenerator(OgaGenerator* p) { @@ -277,7 +283,7 @@ void OGA_API_CALL OgaDestroyGenerator(OgaGenerator* p) { } void OGA_API_CALL OgaDestroyTokenizer(OgaTokenizer* p) { - delete reinterpret_cast(p); + reinterpret_cast(p)->external_owner_ = nullptr; } void OGA_API_CALL OgaDestroyTokenizerStream(OgaTokenizerStream* p) { diff --git a/src/python/python.cpp b/src/python/python.cpp index a1667eb90..1c8db803d 
100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -53,26 +53,33 @@ void Declare_DeviceArray(pybind11::module& m, const char* name) { "get_array", [](Type& t) -> pybind11::array_t { return t.GetNumpy(); }, pybind11::return_value_policy::reference_internal); } -struct PyGeneratorParams : GeneratorParams { +struct PyGeneratorParams { + PyGeneratorParams(const Model& model) : params_{std::make_shared(model)} { + } + + operator const GeneratorParams&() const { return *params_; } + + std::shared_ptr params_; + // Turn the python py_input_ids_ into the low level parameters void Prepare() { // TODO: This will switch to using the variant vs being ifs if (py_input_ids_.size() != 0) { if (py_input_ids_.ndim() == 1) { // Just a 1D array - batch_size = 1; - sequence_length = static_cast(py_input_ids_.shape(0)); + params_->batch_size = 1; + params_->sequence_length = static_cast(py_input_ids_.shape(0)); } else { if (py_input_ids_.ndim() != 2) throw std::runtime_error("Input IDs can only be 1 or 2 dimensional"); - batch_size = static_cast(py_input_ids_.shape(0)); - sequence_length = static_cast(py_input_ids_.shape(1)); + params_->batch_size = static_cast(py_input_ids_.shape(0)); + params_->sequence_length = static_cast(py_input_ids_.shape(1)); } - input_ids = ToSpan(py_input_ids_); + params_->input_ids = ToSpan(py_input_ids_); } if (py_whisper_input_features_.size() != 0) { - GeneratorParams::Whisper& whisper = inputs.emplace(); + GeneratorParams::Whisper& whisper = params_->inputs.emplace(); #ifdef __APPLE__ std::span shape(reinterpret_cast(py_whisper_input_features_.shape()), py_whisper_input_features_.ndim()); @@ -81,9 +88,9 @@ struct PyGeneratorParams : GeneratorParams { #endif whisper.input_features = OrtValue::CreateTensor(Ort::Allocator::GetWithDefaultOptions().GetInfo(), ToSpan(py_whisper_input_features_), shape); whisper.decoder_input_ids = ToSpan(py_whisper_decoder_input_ids_); - batch_size = 1; - sequence_length = static_cast(py_whisper_decoder_input_ids_.shape(1)); - input_ids = ToSpan(py_whisper_decoder_input_ids_); + params_->batch_size = 1; + params_->sequence_length = static_cast(py_whisper_decoder_input_ids_.shape(1)); + params_->input_ids = ToSpan(py_whisper_decoder_input_ids_); } } @@ -92,11 +99,11 @@ struct PyGeneratorParams : GeneratorParams { auto name = entry.first.cast(); try { if (pybind11::isinstance(entry.second)) { - SetSearchNumber(search, name, entry.second.cast()); + SetSearchNumber(params_->search, name, entry.second.cast()); } else if (pybind11::isinstance(entry.second)) { - SetSearchBool(search, name, entry.second.cast()); + SetSearchBool(params_->search, name, entry.second.cast()); } else if (pybind11::isinstance(entry.second)) { - SetSearchNumber(search, name, entry.second.cast()); + SetSearchNumber(params_->search, name, entry.second.cast()); } else throw std::runtime_error("Unknown search option type, can be float/bool/int:" + name); } catch (JSON::unknown_value_error& e) { @@ -182,9 +189,9 @@ PYBIND11_MODULE(onnxruntime_genai, m) { pybind11::class_(m, "GeneratorParams") .def(pybind11::init()) - .def_readonly("pad_token_id", &PyGeneratorParams::pad_token_id) - .def_readonly("eos_token_id", &PyGeneratorParams::eos_token_id) - .def_readonly("vocab_size", &PyGeneratorParams::vocab_size) + .def_property_readonly("pad_token_id", [](const PyGeneratorParams& v) { return v.params_->pad_token_id; }) + .def_property_readonly("eos_token_id", [](const PyGeneratorParams& v) { return v.params_->eos_token_id; }) + .def_property_readonly("vocab_size", [](const 
PyGeneratorParams& v) { return v.params_->vocab_size; }) .def_readwrite("input_ids", &PyGeneratorParams::py_input_ids_) .def_readwrite("whisper_input_features", &PyGeneratorParams::py_whisper_input_features_) .def_readwrite("whisper_decoder_input_ids", &PyGeneratorParams::py_whisper_decoder_input_ids_) @@ -196,7 +203,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { pybind11::class_(m, "TokenizerStream") .def("decode", [](TokenizerStream& t, int32_t token) { return t.Decode(token); }); - pybind11::class_(m, "Tokenizer") + pybind11::class_>(m, "Tokenizer") .def(pybind11::init([](Model& model) { return model.CreateTokenizer(); })) .def("encode", &Tokenizer::Encode) .def("decode", [](const Tokenizer& t, pybind11::array_t tokens) { return t.Decode(ToSpan(tokens)); }) @@ -216,18 +223,11 @@ PYBIND11_MODULE(onnxruntime_genai, m) { }) .def("create_stream", [](const Tokenizer& t) { return t.CreateStream(); }); - pybind11::class_(m, "Model") + pybind11::class_>(m, "Model") .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) - .def("generate_sequence", [](Model& model, pybind11::array_t input_ids, const pybind11::dict& search_options) { - PyGeneratorParams params{model}; - params.SetSearchOptions(search_options); - params.py_input_ids_ = input_ids; - params.Prepare(); - return Generate(model, params)[0]; - }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") diff --git a/src/search.cpp b/src/search.cpp index cc3a6fd10..dd3389270 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -9,7 +9,7 @@ namespace Generators { Search_Cpu::Search_Cpu(const GeneratorParams& params) : Search{params}, - sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_.search.max_length} { + sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_->search.max_length} { auto batch_beam_size = params.BatchBeamSize(); sequence_lengths_buffer_ = AllocateArray(batch_beam_size, &sequence_lengths_); } @@ -25,8 +25,8 @@ GreedySearch_Cpu::GreedySearch_Cpu(const GeneratorParams& params) BeamSearch_Cpu::BeamSearch_Cpu(const GeneratorParams& params) : Search_Cpu(params) { - assert(params_.search.num_beams > 1); // If 1, use GreedySearch - beam_scorer_ = std::make_unique(params_); + assert(params_->search.num_beams > 1); // If 1, use GreedySearch + beam_scorer_ = std::make_unique(*params_); } BeamSearch_Cpu::~BeamSearch_Cpu() = default; @@ -58,16 +58,16 @@ void BeamSearch_Cpu::SelectTop() { // TODO(tianleiwu): use thread pool to parallel int offset = 0; int batch_beam_index = 0; - for (int i = 0; i < params_.batch_size; i++) { - for (int j = 0; j < params_.search.num_beams; j++, batch_beam_index++) { - for (int k = 0; k < params_.vocab_size; k++, offset++) { + for (int i = 0; i < params_->batch_size; i++) { + for (int j = 0; j < params_->search.num_beams; j++, batch_beam_index++) { + for (int k = 0; k < params_->vocab_size; k++, offset++) { next_token_scores_[offset] += beam_scores[batch_beam_index]; } } } // TODO: Write output scores? 
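Declaring `Model` and `Tokenizer` with a `std::shared_ptr` holder, as in the binding changes above, lets Python reference counting and C++ `shared_ptr` ownership cooperate, so C++ objects that copy the pointer keep the instance alive after the Python variable is garbage collected. A self-contained sketch with a hypothetical `Widget` class:

```cpp
#include <pybind11/pybind11.h>
#include <memory>

namespace py = pybind11;

struct Widget {
  explicit Widget(int value) : value_{value} {}
  int value_;
};

// Hypothetical factory mirroring the "create via shared_ptr" style.
std::shared_ptr<Widget> CreateWidget(int value) {
  return std::make_shared<Widget>(value);
}

PYBIND11_MODULE(widget_demo, m) {
  // The second template argument is the holder type: pybind11 stores the
  // object in a shared_ptr, so C++ code that copies that shared_ptr keeps
  // the object alive independently of the Python reference.
  py::class_<Widget, std::shared_ptr<Widget>>(m, "Widget")
      .def(py::init([](int value) { return CreateWidget(value); }))
      .def_readonly("value", &Widget::value_);
}
```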
- unsigned const top_k = 2 * params_.search.num_beams; + const size_t top_k = 2 * params_->search.num_beams; struct ScoreIndex { float score; @@ -76,17 +76,17 @@ void BeamSearch_Cpu::SelectTop() { bool operator<(const ScoreIndex& s) const { return score < s.score; } }; - auto scores = std::make_unique(top_k * params_.batch_size); - auto indices = std::make_unique(top_k * params_.batch_size); - auto tokens = std::make_unique(top_k * params_.batch_size); + auto scores = std::make_unique(top_k * params_->batch_size); + auto indices = std::make_unique(top_k * params_->batch_size); + auto tokens = std::make_unique(top_k * params_->batch_size); - auto next_scores = std::span(scores.get(), top_k * params_.batch_size); - auto next_indices = std::span(indices.get(), top_k * params_.batch_size); - auto next_tokens = std::span(tokens.get(), top_k * params_.batch_size); + auto next_scores = std::span(scores.get(), top_k * params_->batch_size); + auto next_indices = std::span(indices.get(), top_k * params_->batch_size); + auto next_tokens = std::span(tokens.get(), top_k * params_->batch_size); - for (size_t batch_index = 0; batch_index < static_cast(params_.batch_size); batch_index++) { + for (size_t batch_index = 0; batch_index < static_cast(params_->batch_size); batch_index++) { std::priority_queue> queue; - auto token_scores_sub = next_token_scores_.subspan(batch_index * params_.search.num_beams * params_.vocab_size, params_.search.num_beams * params_.vocab_size); + auto token_scores_sub = next_token_scores_.subspan(batch_index * params_->search.num_beams * params_->vocab_size, static_cast(params_->search.num_beams) * params_->vocab_size); for (int i = 0; i < token_scores_sub.size(); i++) { queue.push({token_scores_sub[i], i}); } @@ -96,8 +96,8 @@ void BeamSearch_Cpu::SelectTop() { auto next_scores_sub = next_scores.subspan(top_k * batch_index, top_k); for (unsigned i = 0; i < top_k; i++) { auto v = queue.top(); - next_indices_sub[i] = v.index / params_.vocab_size; - next_tokens_sub[i] = v.index % params_.vocab_size; + next_indices_sub[i] = v.index / params_->vocab_size; + next_tokens_sub[i] = v.index % params_->vocab_size; next_scores_sub[i] = v.score; queue.pop(); } @@ -117,12 +117,12 @@ void BeamSearch_Cpu::SelectTop() { void GreedySearch_Cpu::SelectTop() { // next_tokens = torch.argmax(scores, dim=-1) - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { continue; } - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); auto const token = static_cast(std::distance(scores.begin(), std::max_element(scores.begin(), scores.end()))); SetNextToken(batch_id, token); } @@ -144,8 +144,8 @@ void SoftMax(std::span scores, float temperature) { } void GreedySearch_Cpu::SampleTopK(int k, float temperature) { - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); SoftMax(scores, temperature); // Find the top K scores std::vector indices(scores.size()); @@ -160,11 +160,11 @@ void GreedySearch_Cpu::SampleTopK(int k, float 
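The CPU search hunks above pair a temperature softmax with a top-k pick (a max-heap `std::priority_queue` for beam search, sorting for greedy sampling). A standalone sketch of those two building blocks, independent of the `Generators` types; note this version subtracts the max score before `exp()` for numerical stability, which may differ in detail from the project's `SoftMax`:

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

// In-place softmax with temperature; max-subtraction keeps exp() stable.
void SoftMax(std::vector<float>& scores, float temperature) {
  const float max_score = *std::max_element(scores.begin(), scores.end());
  float sum = 0.0f;
  for (auto& s : scores) {
    s = std::exp((s - max_score) / temperature);
    sum += s;
  }
  for (auto& s : scores) s /= sum;
}

// Sample one token id from the k highest-probability entries.
// Assumes 0 < k <= probs.size().
int SampleTopK(const std::vector<float>& probs, int k, std::mt19937& gen) {
  std::vector<int> indices(probs.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int a, int b) { return probs[a] > probs[b]; });
  std::vector<float> top(k);
  for (int i = 0; i < k; i++) top[i] = probs[indices[i]];
  std::discrete_distribution<int> dist(top.begin(), top.end());
  return indices[dist(gen)];
}
```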
temperature) { void GreedySearch_Cpu::SampleTopP(float p, float temperature) { std::uniform_real_distribution dis(0, p); - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { continue; } - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); SoftMax(scores, temperature); // Sort an array of indices into the scores std::vector indices(scores.size()); @@ -189,11 +189,11 @@ void GreedySearch_Cpu::SampleTopP(float p, float temperature) { void GreedySearch_Cpu::SampleTopKTopP(int k, float p, float temperature) { std::uniform_real_distribution dis(0, p); - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { continue; } - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); SoftMax(scores, temperature); // Find the top K scores std::vector indices(scores.size()); @@ -222,13 +222,13 @@ bool GreedySearch_Cpu::PadIfAlreadyEOS(size_t batch_id) { return false; } - next_tokens_[batch_id] = params_.pad_token_id; + next_tokens_[batch_id] = params_->pad_token_id; return true; } void GreedySearch_Cpu::SetNextToken(size_t batch_id, int32_t token) { next_tokens_[batch_id] = token; - if (token == params_.eos_token_id) { + if (token == params_->eos_token_id) { eos_seen_[batch_id] = true; if (--not_done_count_ == 0) { done_ = true; @@ -239,7 +239,7 @@ void GreedySearch_Cpu::SetNextToken(size_t batch_id, int32_t token) { void GreedySearch_Cpu::AppendNextTokensToSequences() { sequences_.AppendNextTokenToSequences(next_tokens_); - if (sequences_.GetSequenceLength() == params_.search.max_length) { + if (sequences_.GetSequenceLength() == params_->search.max_length) { done_ = true; } } @@ -247,7 +247,7 @@ void GreedySearch_Cpu::AppendNextTokensToSequences() { void BeamSearch_Cpu::AppendNextTokensToSequences() { sequences_.AppendNextTokenToSequences(beam_scorer_->GetNextIndicesCPU(), beam_scorer_->GetNextTokens()); - if (sequences_.GetSequenceLength() == params_.search.max_length) { + if (sequences_.GetSequenceLength() == params_->search.max_length) { done_ = true; } } @@ -257,8 +257,8 @@ void BeamSearch_Cpu::Finalize(size_t num_return_sequences, RoamingArray } std::span Search_Cpu::GetScores(int batch_beam_index) const { - assert(batch_beam_index >= 0 && batch_beam_index < params_.BatchBeamSize()); - return next_token_scores_.subspan(batch_beam_index * params_.vocab_size, params_.vocab_size); + assert(batch_beam_index >= 0 && batch_beam_index < params_->BatchBeamSize()); + return next_token_scores_.subspan(static_cast(batch_beam_index) * params_->vocab_size, params_->vocab_size); } void Search_Cpu::ApplyMinLength(int min_length) { @@ -266,10 +266,10 @@ void Search_Cpu::ApplyMinLength(int min_length) { return; } - const int batch_beam_size = params_.BatchBeamSize(); + const int batch_beam_size = params_->BatchBeamSize(); for (int i = 0; i < batch_beam_size; i++) { std::span const beam_token_scores = GetScores(i); - beam_token_scores[params_.eos_token_id] = std::numeric_limits::lowest(); + beam_token_scores[params_->eos_token_id] = 
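For the `SampleTopP` path, a common formulation (an illustrative sketch, not necessarily the exact scheme used by `GreedySearch_Cpu`) sorts the probabilities in descending order, keeps the smallest prefix whose cumulative mass reaches `p`, and draws one token from that prefix:

```cpp
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// probs must already be a normalized distribution (e.g. softmax output).
int SampleTopP(const std::vector<float>& probs, float p, std::mt19937& gen) {
  std::vector<int> indices(probs.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::sort(indices.begin(), indices.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

  // Keep the smallest prefix of the sorted tokens whose mass reaches p.
  std::vector<float> kept;
  float cumulative = 0.0f;
  for (int idx : indices) {
    kept.push_back(probs[idx]);
    cumulative += probs[idx];
    if (cumulative >= p) break;
  }

  // discrete_distribution normalizes the kept weights and draws one token.
  std::discrete_distribution<int> dist(kept.begin(), kept.end());
  return indices[dist(gen)];
}
```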
std::numeric_limits::lowest(); } } @@ -277,7 +277,7 @@ void Search_Cpu::ApplyRepetitionPenalty(float penalty) { if (penalty == 1.0f) return; - const int batch_beam_size = params_.BatchBeamSize(); + const int batch_beam_size = params_->BatchBeamSize(); for (int i = 0; i < batch_beam_size; i++) { std::span const beam_token_scores = GetScores(i); std::span const sequence = sequences_.GetSequence(i); diff --git a/src/search.h b/src/search.h index bc81313eb..5a52c11e2 100644 --- a/src/search.h +++ b/src/search.h @@ -6,7 +6,7 @@ namespace Generators { struct BeamSearchScorer; struct Search { - Search(const GeneratorParams& params) : params_{params} {} + Search(const GeneratorParams& params) : params_{params.shared_from_this()} {} virtual ~Search() = default; virtual RoamingArray GetNextTokens() = 0; @@ -30,7 +30,7 @@ struct Search { virtual void ApplyMinLength(int min_length) = 0; virtual void ApplyRepetitionPenalty(float penalty) = 0; - const GeneratorParams& params_; + std::shared_ptr params_; }; struct Search_Cpu : Search { @@ -81,7 +81,7 @@ struct GreedySearch_Cpu : Search_Cpu { std::span eos_seen_; // shape (batch_size) std::unique_ptr eos_seen_buffer_; - int not_done_count_{params_.batch_size}; // When zero, every batch entry is done (starts at batch_size_) + int not_done_count_{params_->batch_size}; // When zero, every batch entry is done (starts at batch_size_) std::random_device rd_; std::mt19937 gen_; diff --git a/src/search_cuda.cpp b/src/search_cuda.cpp index 304f62cc2..aa6d85431 100644 --- a/src/search_cuda.cpp +++ b/src/search_cuda.cpp @@ -17,13 +17,13 @@ void OnCudaError(cudaError_t error) { Search_Cuda::Search_Cuda(const GeneratorParams& params) : Search{params}, - sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_.search.max_length, params_.cuda_stream} { + sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_->search.max_length, params_->cuda_stream} { auto batch_beam_size = params.BatchBeamSize(); sequence_lengths_buffer_ = std::make_unique(batch_beam_size); sequence_lengths_ = cpu_span(sequence_lengths_buffer_.get(), batch_beam_size); eos_meet_buffer_ = CudaMallocArray(batch_beam_size, &eos_meet_); - cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_.cuda_stream); + cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_->cuda_stream); done_cpu_ = CudaMallocHostArray(1); *done_cpu_ = false; @@ -32,26 +32,26 @@ Search_Cuda::Search_Cuda(const GeneratorParams& params) GreedySearch_Cuda::GreedySearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { next_tokens_buffer_ = CudaMallocArray(params.batch_size, &next_tokens_); - cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_.cuda_stream); - samplingdata_ = std::make_unique(params_.batch_size, params_.vocab_size, params_.cuda_stream); + cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_->cuda_stream); + samplingdata_ = std::make_unique(params_->batch_size, params_->vocab_size, params_->cuda_stream); } BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { - assert(params_.search.num_beams > 1); // If 1, use GreedySearch - auto batch_beam_size = params_.BatchBeamSize(); - beam_scorer_ = std::make_unique(params_); + assert(params_->search.num_beams > 1); // If 1, use GreedySearch + auto batch_beam_size = params_->BatchBeamSize(); + beam_scorer_ = std::make_unique(*params_); topk_next_tokens_ = CudaMallocArray(2 * batch_beam_size); topk_next_indices_ = 
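The `search.h` change stores `params_` as a `shared_ptr` obtained via `shared_from_this()`, which only works when the `GeneratorParams` object is itself owned by a `shared_ptr` (hence the test and benchmark updates later in this patch that replace stack-allocated params with a `CreateGeneratorParams` factory). A reduced sketch of that contract, with hypothetical `Params`/`Consumer` names:

```cpp
#include <memory>

struct Params : std::enable_shared_from_this<Params> {
  int max_length{10};
};

struct Consumer {
  // Taking a reference but storing a shared_ptr: the argument must already
  // be owned by a shared_ptr, otherwise shared_from_this() throws
  // std::bad_weak_ptr (since C++17).
  explicit Consumer(const Params& params) : params_{params.shared_from_this()} {}
  std::shared_ptr<const Params> params_;
};

int main() {
  auto params = std::make_shared<Params>();  // heap-allocated, shared ownership
  Consumer consumer{*params};                // keeps params alive via params_
  params.reset();                            // consumer still holds a reference
  return consumer.params_->max_length == 10 ? 0 : 1;
}
```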
CudaMallocArray(2 * batch_beam_size); topk_next_scores_ = CudaMallocArray(2 * batch_beam_size); constexpr size_t max_parts_of_vocab = 128; - size_t topk_buffer_size = batch_beam_size * (max_parts_of_vocab + 1) * params_.search.num_beams * 2 * 2; + size_t topk_buffer_size = batch_beam_size * (max_parts_of_vocab + 1) * params_->search.num_beams * 2 * 2; topk_buffer_ = CudaMallocArray(topk_buffer_size); static_assert(sizeof(float) == sizeof(int32_t)); // The topk_buffer assumes these match, fix for float16 - cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), params_.cuda_stream); + cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), params_->cuda_stream); } BeamSearch_Cuda::~BeamSearch_Cuda() = default; @@ -82,13 +82,13 @@ void BeamSearch_Cuda::SelectTop() { // Add beam score to next token scores. Corresponding python code is like: // next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) cuda::LaunchAddProbsKernel(next_token_scores_.data(), beam_scores.data(), - params_.batch_size, params_.search.num_beams, params_.vocab_size, params_.cuda_stream); + params_->batch_size, params_->search.num_beams, params_->vocab_size, params_->cuda_stream); // TODO: Write output scores? - if (params_.search.num_beams <= 32) { + if (params_->search.num_beams <= 32) { constexpr size_t max_parts_of_vocab = 128; - size_t candidate_count = params_.BatchBeamSize() * 2 * params_.search.num_beams; + size_t candidate_count = params_->BatchBeamSize() * 2 * params_->search.num_beams; float* topk_tmp_buffer = topk_buffer_.get(); float* topk_scores_1st_stage = topk_tmp_buffer; int32_t* topk_tokens_1st_stage = reinterpret_cast(topk_scores_1st_stage + candidate_count * max_parts_of_vocab); @@ -96,10 +96,10 @@ void BeamSearch_Cuda::SelectTop() { int32_t* topk_tokens_2nd_stage = reinterpret_cast(topk_scores_2nd_stage + candidate_count); cuda::BeamSearchTopK(next_token_scores_.data(), - params_.batch_size, - params_.search.num_beams, - params_.vocab_size, - 2 * params_.search.num_beams, + params_->batch_size, + params_->search.num_beams, + params_->vocab_size, + 2 * params_->search.num_beams, topk_scores_1st_stage, topk_tokens_1st_stage, topk_scores_2nd_stage, @@ -107,13 +107,13 @@ void BeamSearch_Cuda::SelectTop() { topk_next_scores_.get(), topk_next_tokens_.get(), topk_next_indices_.get(), - params_.cuda_stream); + params_->cuda_stream); } else assert(false); - CudaCheck() == cudaStreamSynchronize(params_.cuda_stream); + CudaCheck() == cudaStreamSynchronize(params_->cuda_stream); - size_t size = params_.BatchBeamSize() * 2; + size_t size = params_->BatchBeamSize() * 2; std::span next_scores{topk_next_scores_.get(), size}; std::span next_tokens{topk_next_tokens_.get(), size}; std::span next_indices{topk_next_indices_.get(), size}; @@ -131,52 +131,52 @@ void BeamSearch_Cuda::SelectTop() { } void GreedySearch_Cuda::SelectTop() { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, 1, 0.0, 1.0); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, 1, 0.0, 1.0); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::SampleTopP(float p, 
float temperature) { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, -1, p, temperature); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, -1, p, temperature); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::SampleTopK(int k, float temperature) { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, k, 0.0, temperature); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, k, 0.0, temperature); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::SampleTopKTopP(int k, float p, float temperature) { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, k, p, temperature); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, k, p, temperature); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::CheckForEOS() { assert(next_tokens_.size() == eos_meet_.size()); - cuda::Launch_CheckForEOS(next_tokens_.data(), static_cast(next_tokens_.size()), eos_meet_.data(), params_.eos_token_id, params_.pad_token_id, done_cpu_.get(), params_.cuda_stream); + cuda::Launch_CheckForEOS(next_tokens_.data(), static_cast(next_tokens_.size()), eos_meet_.data(), params_->eos_token_id, params_->pad_token_id, done_cpu_.get(), params_->cuda_stream); } void GreedySearch_Cuda::AppendNextTokensToSequences() { sequences_.AppendNextTokenToSequences(next_tokens_); - if (sequences_.GetSequenceLength() == params_.search.max_length) + if (sequences_.GetSequenceLength() == params_->search.max_length) *done_cpu_ = true; } bool BeamSearch_Cuda::IsDone() const { beam_scorer_->IsDone(); - return beam_scorer_->IsDoneLater() || sequences_.GetSequenceLength() == params_.search.max_length; + return beam_scorer_->IsDoneLater() || sequences_.GetSequenceLength() == params_->search.max_length; } void BeamSearch_Cuda::AppendNextTokensToSequences() { @@ -195,10 +195,10 @@ void GreedySearch::Finalize(size_t num_return_sequences, std::span outp // Copy the sequences to output std::span output{ output_sequences_->GetTensorMutableData(), shape_count}; - for (int batch_id = 0; batch_id < params_.batch_size; ++batch_id) { + for (int batch_id = 0; batch_id < params_->batch_size; ++batch_id) { auto batch_output = output.subspan( - static_cast(batch_id) * params_.max_length, - params_.max_length); + static_cast(batch_id) * params_->max_length, + params_->max_length); std::span sequence_source = 
sequences_.GetSequence(batch_id); std::copy(sequence_source, batch_output); } @@ -206,8 +206,8 @@ void GreedySearch::Finalize(size_t num_return_sequences, std::span outp #endif std::span Search_Cuda::GetScores(int batch_beam_index) { - assert(batch_beam_index >= 0 && batch_beam_index < params_.BatchBeamSize()); - return next_token_scores_.subspan(batch_beam_index * params_.vocab_size, params_.vocab_size); + assert(batch_beam_index >= 0 && batch_beam_index < params_->BatchBeamSize()); + return next_token_scores_.subspan(batch_beam_index * params_->vocab_size, params_->vocab_size); } std::span Search_Cuda::GetScores() { @@ -218,7 +218,7 @@ void Search_Cuda::ApplyMinLength(int min_length) { if (sequences_.GetSequenceLength() >= min_length) return; - cuda::LaunchSetScoreProcessor(GetScores().data(), params_.BatchBeamSize(), params_.vocab_size, params_.eos_token_id, std::numeric_limits::lowest(), params_.cuda_stream); + cuda::LaunchSetScoreProcessor(GetScores().data(), params_->BatchBeamSize(), params_->vocab_size, params_->eos_token_id, std::numeric_limits::lowest(), params_->cuda_stream); } void Search_Cuda::ApplyRepetitionPenalty(float penalty) { @@ -226,8 +226,8 @@ void Search_Cuda::ApplyRepetitionPenalty(float penalty) { return; cuda::LaunchRepetitionPenaltyProcessor(sequences_.GetSequences().data(), - GetScores().data(), params_.batch_size, params_.search.num_beams, params_.vocab_size, - params_.search.max_length, GetSequenceLength(), penalty, params_.cuda_stream); + GetScores().data(), params_->batch_size, params_->search.num_beams, params_->vocab_size, + params_->search.max_length, GetSequenceLength(), penalty, params_->cuda_stream); } } // namespace Generators \ No newline at end of file diff --git a/src/search_cuda.h b/src/search_cuda.h index 8d1ddbbb4..11a5a428d 100644 --- a/src/search_cuda.h +++ b/src/search_cuda.h @@ -15,7 +15,7 @@ struct Search_Cuda : Search { RoamingArray GetSequence(int index) override { return sequences_.GetSequence(index); } bool IsDone() const { - cudaStreamSynchronize(params_.cuda_stream); + cudaStreamSynchronize(params_->cuda_stream); return *done_cpu_; } // TODO: Use an event void SetLogits(RoamingArray logits); diff --git a/test/model_tests.cpp b/test/model_tests.cpp index 90655cfed..a2b3a7832 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -34,13 +34,13 @@ TEST(ModelTests, GreedySearchGptFp32) { auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - Generators::GeneratorParams params{*model}; - params.search.max_length = 10; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.input_ids = input_ids; + auto params = Generators::CreateGeneratorParams(*model); + params->search.max_length = 10; + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->input_ids = input_ids; - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); @@ -48,10 +48,10 @@ TEST(ModelTests, GreedySearchGptFp32) { } // Verify outputs match expected outputs - for (size_t i = 0; i < static_cast(params.batch_size); i++) { + for (size_t i = 0; i < static_cast(params->batch_size); i++) { auto sequence = generator->GetSequence(i).GetCPU(); - auto* expected_output_start = &expected_output[i * params.search.max_length]; - EXPECT_TRUE(0 == 
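The `ApplyRepetitionPenalty` hunks (CPU earlier, CUDA above) only switch the parameter access to `params_->`; for context, a common formulation of the penalty itself is the CTRL-style rule below, shown as a sketch that may differ in detail from this project's CPU loop and CUDA kernel:

```cpp
#include <cstdint>
#include <span>

// CTRL-style repetition penalty: tokens already present in the sequence
// become less likely. Sketch only; the project's implementations may differ.
void ApplyRepetitionPenalty(std::span<float> scores,
                            std::span<const int32_t> sequence,
                            float penalty) {
  if (penalty == 1.0f) return;  // 1.0 means "no penalty"
  for (int32_t token : sequence) {
    float& score = scores[token];
    // Positive scores are divided, negative scores multiplied, so the
    // adjustment always pushes the token's score down for penalty > 1.
    score = score > 0 ? score / penalty : score * penalty;
  }
}
```

A `penalty` of exactly 1.0 is a no-op, which is why both the CPU and CUDA paths in the patch return early in that case.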
std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + auto* expected_output_start = &expected_output[i * params->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -74,16 +74,16 @@ TEST(ModelTests, BeamSearchGptFp32) { auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - Generators::GeneratorParams params{*model}; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.input_ids = input_ids; - params.search.max_length = 20; - params.search.length_penalty = 1.0f; - params.search.num_beams = 4; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->input_ids = input_ids; + params->search.max_length = 20; + params->search.length_penalty = 1.0f; + params->search.num_beams = 4; - Generators::BeamSearch_Cpu search{params}; - auto state = model->CreateState(search.sequence_lengths_, params); + Generators::BeamSearch_Cpu search{*params}; + auto state = model->CreateState(search.sequence_lengths_, *params); while (!search.IsDone()) { search.SetLogits(state->Run(search.GetSequenceLength(), search.GetNextTokens(), search.GetNextIndices())); @@ -95,14 +95,14 @@ TEST(ModelTests, BeamSearchGptFp32) { search.SelectTop(); } - std::vector output_sequence(static_cast(search.params_.batch_size) * search.params_.search.max_length); + std::vector output_sequence(static_cast(search.params_->batch_size) * search.params_->search.max_length); search.Finalize(1, Generators::cpu_span{output_sequence}, {}); // Verify outputs match expected outputs - for (size_t i = 0; i < static_cast(search.params_.batch_size); i++) { - auto sequence = std::span(output_sequence.data() + search.params_.search.max_length * i, search.params_.search.max_length); - auto* expected_output_start = &expected_output[i * search.params_.search.max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + for (size_t i = 0; i < static_cast(search.params_->batch_size); i++) { + auto sequence = std::span(output_sequence.data() + search.params_->search.max_length * i, search.params_->search.max_length); + auto* expected_output_start = &expected_output[i * search.params_->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -118,13 +118,13 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, const char* model_label) auto model = Generators::CreateModel(*g_ort_env, model_path); - Generators::GeneratorParams params{*model}; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.search.max_length = 10; - params.input_ids = input_ids; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->search.max_length = 10; + params->input_ids = input_ids; - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); @@ -132,11 +132,11 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, 
const char* model_label) } // Verify outputs match expected outputs - for (int i = 0; i < params.batch_size; i++) { + for (int i = 0; i < params->batch_size; i++) { auto sequence_gpu = generator->GetSequence(i); auto sequence = sequence_gpu.GetCPU(); - auto* expected_output_start = &expected_output[i * params.search.max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + auto* expected_output_start = &expected_output[i * params->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -163,34 +163,34 @@ void Test_BeamSearch_Gpt_Cuda(const char* model_path, const char* model_label) { // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) auto model = Generators::CreateModel(*g_ort_env, model_path); - Generators::GeneratorParams params{*model}; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.input_ids = input_ids; - params.search.max_length = 20; - params.search.num_beams = 4; - params.search.length_penalty = 1.0f; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->input_ids = input_ids; + params->search.max_length = 20; + params->search.num_beams = 4; + params->search.length_penalty = 1.0f; - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); generator->GenerateNextToken(); } - size_t sequence_length = params.batch_size * params.search.max_length; + size_t sequence_length = params->batch_size * params->search.max_length; auto output_sequence_cuda = Generators::CudaMallocArray(sequence_length); auto output_sequence_cpu = std::make_unique(sequence_length); generator->search_->Finalize(1, Generators::gpu_span(output_sequence_cuda.get(), sequence_length), {}); - cudaMemcpyAsync(output_sequence_cpu.get(), output_sequence_cuda.get(), sequence_length * sizeof(int32_t), cudaMemcpyDeviceToHost, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); + cudaMemcpyAsync(output_sequence_cpu.get(), output_sequence_cuda.get(), sequence_length * sizeof(int32_t), cudaMemcpyDeviceToHost, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs - for (int i = 0; i < params.batch_size; i++) { - auto sequence = std::span(output_sequence_cpu.get() + params.search.max_length * i, params.search.max_length); - auto* expected_output_start = &expected_output[i * params.search.max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + for (int i = 0; i < params->batch_size; i++) { + auto sequence = std::span(output_sequence_cpu.get() + params->search.max_length * i, params->search.max_length); + auto* expected_output_start = &expected_output[i * params->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -216,14 +216,14 @@ Print all primes between 1 and n auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); - Generators::GeneratorParams params{*model}; - params.batch_size = 1; - params.sequence_length = 
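The test updates above all converge on the same usage pattern: create the params on the heap through the factory, configure them through `->`, and drive the generator loop. A condensed sketch of that pattern; the include paths and input values are placeholders for this sketch, not taken from the tests:

```cpp
#include "generators.h"     // include paths assumed for this sketch
#include "models/model.h"
#include <vector>

void RunGreedyOnce(Generators::Model& model) {
  std::vector<int32_t> input_ids{0, 1, 2, 3};  // placeholder prompt tokens

  auto params = Generators::CreateGeneratorParams(model);  // shared_ptr-owned params
  params->batch_size = 1;
  params->sequence_length = static_cast<int>(input_ids.size());
  params->input_ids = input_ids;
  params->search.max_length = 10;

  auto generator = Generators::CreateGenerator(model, *params);
  while (!generator->IsDone()) {
    generator->ComputeLogits();
    generator->GenerateNextToken();
  }
  auto sequence = generator->GetSequence(0).GetCPU();  // final tokens for batch entry 0
}
```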
static_cast(tokens.size()); - params.input_ids = tokens; - params.search.max_length = 128; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = 1; + params->sequence_length = static_cast(tokens.size()); + params->input_ids = tokens; + params->search.max_length = 128; // Generator version - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); generator->GenerateNextToken_Top(); @@ -254,14 +254,14 @@ Print all primes between 1 and n auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); - Generators::GeneratorParams params{*model}; - params.batch_size = 1; - params.sequence_length = static_cast(tokens.size()); - params.input_ids = tokens; - params.search.max_length = 128; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = 1; + params->sequence_length = static_cast(tokens.size()); + params->input_ids = tokens; + params->search.max_length = 128; // High level version - auto result = Generators::Generate(*model, params); + auto result = Generators::Generate(*model, *params); std::cout << tokenizer->Decode(result[0]) << "\r\n"; #else diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index 3f21ed669..6190e2507 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -24,24 +24,24 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCpu) { int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - std::unique_ptr logits_cpu(new float[vocab_size * batch_size]); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); double total_time = 0.0; int num_iter = 1000; for (int i = 0; i < num_iter; i++) { - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); int num_large = dist(engine); - CreateRandomLogits(logits_cpu.get(), num_large, vocab_size, batch_size, engine); - generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); + CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); + generator->search_->SetLogits(Generators::cpu_span(logits_cpu.data(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopP(0.95f, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -59,14 +59,14 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { int batch_size = 1; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = 
Generators::DeviceType::CPU; - std::unique_ptr logits_cpu(new float[vocab_size * batch_size]); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(5, 25); @@ -74,9 +74,9 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - CreateRandomLogits(logits_cpu.get(), num_large, vocab_size, batch_size, engine); - generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); + auto generator = Generators::CreateGenerator(*model, *params); + CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); + generator->search_->SetLogits(Generators::cpu_span(logits_cpu.data(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopK(k, 1.0f); @@ -97,14 +97,14 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - std::unique_ptr logits_cpu(new float[vocab_size * batch_size]); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(5, 25); @@ -112,9 +112,9 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - CreateRandomLogits(logits_cpu.get(), num_large, vocab_size, batch_size, engine); - generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); + auto generator = Generators::CreateGenerator(*model, *params); + CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); + generator->search_->SetLogits(Generators::cpu_span(logits_cpu.data(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopKTopP(k, p, 1.0f); @@ -136,14 +136,14 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - float* cpu_logits = new float[vocab_size * batch_size]; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length 
= 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + std::vector cpu_logits(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); @@ -153,13 +153,13 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpy(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopP(0.95f, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -167,7 +167,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { total_time += duration.count(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); } double average_time = total_time / double(num_iter); std::cout << "Average time taken by TopP CUDA: " @@ -183,16 +183,16 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { int batch_size = 1; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocArray(vocab_size * batch_size); - float* cpu_logits = new float[vocab_size * batch_size]; + std::vector cpu_logits(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); @@ -200,12 +200,12 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpy(cpu_logits, 
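The benchmark hunks above and below replace `new float[...]` buffers (which were never freed) with `std::vector<float>`. The change is mechanical but worth spelling out, since `.data()` still provides the contiguous pointer that the CUDA copies expect:

```cpp
#include <vector>

void FillLogits(int vocab_size, int batch_size) {
  // Before: float* cpu_logits = new float[vocab_size * batch_size];  // leaked each run
  // After: the container owns the memory and releases it automatically.
  std::vector<float> cpu_logits(static_cast<size_t>(vocab_size) * batch_size, 0.0f);
  // ... use cpu_logits.data() exactly where the raw pointer was used ...
}  // memory released here
```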
logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopK(k, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -227,30 +227,30 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocArray(vocab_size * batch_size); - float* cpu_logits = new float[vocab_size * batch_size]; + std::vector cpu_logits(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); double total_time = 0.0; int num_iter = 1000; for (int i = 0; i < num_iter; i++) { - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); int num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpy(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopKTopP(k, p, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -258,7 +258,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { total_time += duration.count(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); } double average_time = total_time / double(num_iter); std::cout << "Average time taken by TopP+K: " @@ -273,13 +273,13 @@ TEST(Benchmarks, 
BenchmarkRandomizedSelectTopCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 12; std::vector input_ids{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; // Needs to match batch_size - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocArray(vocab_size * batch_size); std::vector cpu_logits(vocab_size * batch_size); @@ -290,12 +290,12 @@ TEST(Benchmarks, BenchmarkRandomizedSelectTopCuda) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SelectTop(); auto stop = std::chrono::high_resolution_clock::now(); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index f42c03e0a..531270f78 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -25,14 +25,14 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { 0.1f, 0.1f, 0.1f, 0.1f, 0.6f}; int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + auto generator = Generators::CreateGenerator(*model, *params); auto logits_span = Generators::cpu_span(logits_cpu); generator->search_->SetLogits(logits_span); generator->computed_logits_ = true; @@ -51,14 +51,14 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { 1.25f, 0.25f, 1.5f, 0.25f, 2.0f}; int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 
1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + auto generator = Generators::CreateGenerator(*model, *params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; @@ -83,14 +83,14 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { 1.25f, 0.25f, 1.5f, 0.25f, 2.0f}; int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + auto generator = Generators::CreateGenerator(*model, *params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; @@ -128,20 +128,20 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); int num_iter = 100; for (int i = 0; i < num_iter; i++) { - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); int num_large = dist(engine); CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; @@ -164,13 +164,13 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { int batch_size = 5; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; std::vector 
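The randomized sampling tests above generate logits with a handful of deliberately large entries and then check that the sampled token is one of them; the verification bodies are elided in the diff. An illustrative helper for that kind of check (a sketch, not the tests' actual code):

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Returns true if `token` is one of the k largest entries in `logits`.
// Assumes 0 < k <= logits.size().
bool IsInTopK(const std::vector<float>& logits, int32_t token, int k) {
  std::vector<int32_t> indices(logits.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int32_t a, int32_t b) { return logits[a] > logits[b]; });
  return std::find(indices.begin(), indices.begin() + k, token) != indices.begin() + k;
}
```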
logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); @@ -178,7 +178,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy=logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); @@ -201,13 +201,13 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); @@ -215,7 +215,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); @@ -246,16 +246,16 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { auto logits_gpu = Generators::CudaMallocArray(logits_cpu.size()); int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs @@ -274,16 +274,16 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { auto logits_gpu = Generators::CudaMallocArray(logits_cpu.size()); int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - 
params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs @@ -307,16 +307,16 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { auto logits_gpu = Generators::CudaMallocArray(logits_cpu.size()); int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs @@ -336,13 +336,13 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -352,15 +352,15 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - 
auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_TopP(0.95f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -376,13 +376,13 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { int batch_size = 5; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -392,15 +392,15 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_TopK(k, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs 
match expected outputs for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -417,13 +417,13 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -433,15 +433,15 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -456,13 +456,13 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -472,15 +472,15 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int 
num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_Top(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { float max_score = *std::max_element(cpu_logits + vocab_size * b, cpu_logits + vocab_size * (b + 1)); From e46c8b21665c7197e975e52edba8b67b920bf1bf Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Thu, 21 Mar 2024 13:30:10 -0700 Subject: [PATCH 16/36] Fix nightly pipeline (#222) --- .github/workflows/linux-cpu-x64-nightly-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml index 89a52a803..c7a0234b1 100644 --- a/.github/workflows/linux-cpu-x64-nightly-build.yml +++ b/.github/workflows/linux-cpu-x64-nightly-build.yml @@ -45,12 +45,12 @@ jobs: run: | set -e -x rm -rf build - cmake --preset linux_clang_cpu_release - cmake --build --preset linux_clang_cpu_release + cmake --preset linux_gcc_cpu_release + cmake --build --preset linux_gcc_cpu_release - name: Install the python wheel and test dependencies run: | - python3 -m pip install build/clang_cpu/release/wheel/onnxruntime_genai*.whl + python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl python3 -m pip install -r test/python/requirements-nightly-cpu.txt --user - name: Get HuggingFace Token From a29d4a5d71b760ea2a81cf141892e95b3f917afc Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:40:39 -0700 Subject: [PATCH 17/36] Add repeat KV to model builder (#210) ### Description This PR adds `repeat_kv` to the model builder for models where `num_attention_heads != num_key_value_heads`. ### Motivation and Context By supporting `repeat_kv`, models where `num_attention_heads != num_key_value_heads` can now run on both CPU and GPU. 
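For illustration, the interleave-and-repeat that the exported subgraph performs is equivalent to the following NumPy sketch (the `repeat_kv_reference` helper and its names are hypothetical and only describe the intended tensor transformation; they are not code added by this PR):

```python
import numpy as np

def repeat_kv_reference(kv, num_attn_heads):
    # kv has shape (batch_size, num_kv_heads, sequence_length, head_size)
    batch_size, num_kv_heads, seq_len, head_size = kv.shape
    n_rep = num_attn_heads // num_kv_heads
    # Broadcast each KV head n_rep times in an interleaved pattern
    # (the Unsqueeze + Expand step of the subgraph)
    expanded = np.broadcast_to(
        kv[:, :, None, :, :],
        (batch_size, num_kv_heads, n_rep, seq_len, head_size),
    )
    # Fold the repeats back into the head dimension (the final Reshape step)
    return expanded.reshape(batch_size, num_kv_heads * n_rep, seq_len, head_size)
```

In the exported graph the same effect is expressed with Reshape/Transpose/Unsqueeze/Expand nodes, alongside the Concat that updates the KV cache.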
--- src/python/py/models/builder.py | 298 ++++++++++++++++++++++++-------- 1 file changed, 230 insertions(+), 68 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1652d5778..fa022f7aa 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -116,7 +116,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "op_type": "MultiHeadAttention", # Attention op to use "use_gqa": ep == "cuda" and io_dtype == TensorProto.FLOAT16 # Check if GroupQueryAttention can be used } - if self.attention_attrs["use_gqa"] or self.num_attn_heads != self.num_kv_heads: + if self.attention_attrs["use_gqa"]: self.attention_attrs["op_type"] = "GroupQueryAttention" # Quantization-specific variables (INT4, INT8, etc.) @@ -166,15 +166,15 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): "do_sample": config.do_sample if hasattr(config, "do_sample") else False, "early_stopping": True, "length_penalty": config.length_penalty if hasattr(config, "length_penalty") else 1.0, - "max_length": config.max_length if hasattr(config, "max_length") else 20, + "max_length": self.context_length, "min_length": 0, "no_repeat_ngram_size": config.no_repeat_ngram_size if hasattr(config, "no_repeat_ngram_size") else 0, "num_beams": config.num_beams if hasattr(config, "num_beams") else 1, "num_return_sequences": config.num_return_sequences if hasattr(config, "num_return_sequences") else 1, - "past_present_share_buffer": True if self.attention_attrs["op_type"] == "GroupQueryAttention" else False, + "past_present_share_buffer": self.attention_attrs["op_type"] == "GroupQueryAttention", "repetition_penalty": config.repetition_penalty if hasattr(config, "repetition_penalty") else 1.0, "temperature": config.temperature if hasattr(config, "temperature") else 1.0, - "top_k": config.top_k if hasattr(config, "top_k") else 50, + "top_k": 1, "top_p": config.top_p if hasattr(config, "top_p") else 1.0, }, } @@ -337,7 +337,7 @@ def make_constant(self, name): path = name.split("/") onnx_dtype, dims, num = eval(path[-3]), path[-2], eval(path[-1]) np_dtype = self.to_numpy_dtype[onnx_dtype] - value = numpy_helper.from_array(np.array(num if dims == "0D" else [num], dtype=np_dtype), name=name.replace("constants", "numpy_helper")) + value = numpy_helper.from_array(np.array(num if dims == "0D" else list(num) if type(num) == tuple else [num], dtype=np_dtype), name=name.replace("constants", "numpy_helper")) node_name = name.replace("constants", "constant_nodes") self.make_node("Constant", inputs=[], outputs=[name], name=node_name, value=value) @@ -349,10 +349,10 @@ def make_gather(self, name, inputs, axis): self.make_node("Gather", inputs=inputs, outputs=[output], name=name, axis=axis) self.make_value_info(output, TensorProto.INT64, shape=[]) - def make_reshape(self, name, inputs): + def make_reshape(self, name, inputs, dtype, shape): output = f"{name}/output_0" self.make_node("Reshape", inputs=inputs, outputs=[output], name=name) - self.make_value_info(output, TensorProto.INT64, shape=None) + self.make_value_info(output, dtype, shape=shape) def make_shape(self, name, root_input, shape): output = f"{name}/output_0" @@ -379,10 +379,10 @@ def make_concat(self, name, inputs, dtype, shape, axis=0): self.make_node("Concat", inputs=inputs, outputs=[output], name=name, axis=axis) self.make_value_info(output, dtype, shape=shape) - def make_equal(self, name, inputs): + def make_equal(self, name, inputs, shape): output = f"{name}/output_0" 
self.make_node("Equal", inputs=inputs, outputs=[output], name=name) - self.make_value_info(output, TensorProto.BOOL, shape=[4]) + self.make_value_info(output, TensorProto.BOOL, shape=shape) def make_where(self, name, inputs, dtype, shape): output = f"{name}/output_0" @@ -439,6 +439,11 @@ def make_mul(self, name, inputs, dtype, shape): self.make_node("Mul", inputs=inputs, outputs=[output], name=name) self.make_value_info(output, dtype, shape=shape) + def make_transpose(self, name, root_input, dtype, shape, perm): + output = f"{name}/output_0" + self.make_node("Transpose", inputs=[root_input], outputs=[output], perm=perm) + self.make_value_info(output, dtype, shape=shape) + def make_matmul(self, matmul, name, root_input, **kwargs): self.make_matmul_fp16_or_fp32(matmul, name, root_input, **kwargs) @@ -512,7 +517,6 @@ def make_embedding(self, embedding): self.layernorm_attrs["root_input"] = layernorm_attrs_value self.layernorm_attrs["skip_input"] = layernorm_attrs_value - def make_layernorm(self, layer_id, layernorm, skip, simple, location): root_input = self.layernorm_attrs["root_input"] skip_input = self.layernorm_attrs["skip_input"] @@ -552,7 +556,7 @@ def make_layernorm(self, layer_id, layernorm, skip, simple, location): return output_0 - def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): + def make_rotary_embedding_caches(self, rotemb): cos_cache_name, sin_cache_name = "cos_cache", "sin_cache" if self.rotemb_attrs["create_rotary_embedding_caches"]: @@ -576,11 +580,195 @@ def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): self.rotemb_attrs["create_rotary_embedding_caches"] = False + return cos_cache_name, sin_cache_name + + def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): + cos_cache_name, sin_cache_name = self.make_rotary_embedding_caches(rotemb) + inputs = [root_input, kwargs.pop("position_ids"), cos_cache_name, sin_cache_name] output = f"{name}/output_0" self.make_node("RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", interleaved=0, **kwargs) self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * (self.num_kv_heads if "k_rotary" in name else self.num_attn_heads)]) + # TODO: This function and any corresponding changes to support it are temporary until ORT supports GQA for CPU + def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): + # Make subgraph that repeats tensor of shape (batch_size, sequence_length, num_kv_heads, head_size) + # to shape (batch_size, sequence_length, num_attn_heads, head_size) in an interleaved pattern + # and updates the KV caches + # + # root_input + # | + # Reshape + # | + # Transpose + # | + # | past_kv + # | / + # Concat + # | \ + # | present_kv + # | + # +-------+---------+ + # | | + # | Shape + # | | + # | +-----------+-----------+-----------+ + # | | | | | + # | Gather Gather Gather Gather + # | (idx=0) (idx=1) (idx=2) (idx=3) + # | | | | | + # | Unsqueeze Unsqueeze Unsqueeze Unsqueeze + # | | | | | + # | +-----------+-----------+-----------+ + # | | + # | +-----------------------+ + # | | | + # | | Mul + # | | | + # | Concat Concat + # | (5D) (4D) + # | | | + # | Reshape | + # | / | \ | + # | / | \ | + # | / | \ / + # | / | \ / + # | / | \ / + # | / Shape \ / + # | / | \ / + # | | ConstantOfShape \ / + # | \ | \ \ / + # | \ | Mul | / + # | \ | | / / + # | \ | Equal / + # | \ | / / + # \ \ | / / + # \ \ | / / + # \ \ | / / + # \ \ | / / + # Unsqueeze Where / + # \ / / + # \ / / + # \ / / + # \ / 
/ + # Expand / + # | / + # | / + # | / + # | / + # | / + # Reshape + # | + # Transpose + # | + # Reshape + basename = f"/model/layers.{layer_id}/attn/{'k_proj' if past_kv.endswith('key') else 'v_proj'}/repeat_kv" + + # Make the initial subgraph + # + # +------> Gather --> Unsqueeze -----+ + # | | + # past_kv +------> Gather --> Unsqueeze -----+---> Mul --> Concat (4D) + # | | | + # root_input --> Reshape --> Transpose --> Concat --> Shape ---> Gather --> Unsqueeze -----+---> Concat (5D) + # | | | + # present_kv +------> Gather --> Unsqueeze -----+ + reshape_1_name = f"{basename}/Reshape_1" + reshape_1_inputs = [root_input, f"/model/constants/TensorProto.INT64/1D/0, 0, {self.num_kv_heads}, -1"] + self.make_reshape(reshape_1_name, reshape_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_kv_heads, self.head_size]) + transpose_1_name = f"{basename}/Transpose_1" + transpose_1_input = f"{reshape_1_name}/output_0" + self.make_transpose(transpose_1_name, transpose_1_input, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 'sequence_length', self.head_size], perm=[0,2,1,3]) + concat_1_name = f"{basename}/Concat_1" + concat_1_inputs = [past_kv, f"{transpose_1_name}/output_0"] + self.make_node("Concat", inputs=concat_1_inputs, outputs=[present_kv], name=concat_1_name, axis=2) + + shape_1_name = f"{basename}/Shape_1" + self.make_shape(shape_1_name, present_kv, shape=[4]) + gather_1_name = f"{basename}/Gather_1" + gather_1_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/0"] + self.make_gather(gather_1_name, gather_1_inputs, axis=0) + unsqueeze_1_name = f"{basename}/Unsqueeze_1" + unsqueeze_1_inputs = [f"{gather_1_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_1_name, unsqueeze_1_inputs, dtype=TensorProto.INT64, shape=[1]) + gather_2_name = f"{basename}/Gather_2" + gather_2_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/1"] + self.make_gather(gather_2_name, gather_2_inputs, axis=0) + unsqueeze_2_name = f"{basename}/Unsqueeze_2" + unsqueeze_2_inputs = [f"{gather_2_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_2_name, unsqueeze_2_inputs, dtype=TensorProto.INT64, shape=[1]) + gather_3_name = f"{basename}/Gather_3" + gather_3_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/2"] + self.make_gather(gather_3_name, gather_3_inputs, axis=0) + unsqueeze_3_name = f"{basename}/Unsqueeze_3" + unsqueeze_3_inputs = [f"{gather_3_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_3_name, unsqueeze_3_inputs, dtype=TensorProto.INT64, shape=[1]) + gather_4_name = f"{basename}/Gather_4" + gather_4_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/3"] + self.make_gather(gather_4_name, gather_4_inputs, axis=0) + unsqueeze_4_name = f"{basename}/Unsqueeze_4" + unsqueeze_4_inputs = [f"{gather_4_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_4_name, unsqueeze_4_inputs, dtype=TensorProto.INT64, shape=[1]) + concat_2_name = f"{basename}/Concat_2" + concat_2_inputs = [f"{unsqueeze_1_name}/output_0", f"{unsqueeze_2_name}/output_0", f"/model/constants/TensorProto.INT64/1D/{self.num_attn_heads // self.num_kv_heads}", f"{unsqueeze_3_name}/output_0", f"{unsqueeze_4_name}/output_0"] + self.make_concat(concat_2_name, concat_2_inputs, dtype=TensorProto.INT64, shape=[5], axis=0) + + mul_1_name = f"{basename}/Mul_1" 
+ mul_1_inputs = [f"{unsqueeze_2_name}/output_0", f"/model/constants/TensorProto.INT64/0D/{self.num_attn_heads // self.num_kv_heads}"] + self.make_mul(mul_1_name, mul_1_inputs, dtype=TensorProto.INT64, shape=None) + concat_3_name = f"{basename}/Concat_3" + concat_3_inputs = [f"{unsqueeze_1_name}/output_0", f"{mul_1_name}/output_0", f"{unsqueeze_3_name}/output_0", f"{unsqueeze_4_name}/output_0"] + self.make_concat(concat_3_name, concat_3_inputs, dtype=TensorProto.INT64, shape=[4], axis=0) + + # Make the subgraph that follows the initial subgraph + # + # Mul ---> Equal + # / \ + # Reshape --> Shape --> ConstantOfShape --> Where + # | | + # +----------------------------------------+ + reshape_2_name = f"{basename}/Reshape_2" + reshape_2_inputs = [f"{concat_2_name}/output_0", "/model/constants/TensorProto.INT64/1D/-1"] + self.make_reshape(reshape_2_name, reshape_2_inputs, dtype=TensorProto.INT64, shape=None) + shape_2_name = f"{basename}/Shape_2" + self.make_shape(shape_2_name, f"{reshape_2_name}/output_0", shape=[1]) + constant_shape_name = f"{basename}/ConstantOfShape" + constant_shape_value = numpy_helper.from_array(np.array([1], dtype="int64")) + self.make_constant_of_shape(constant_shape_name, f"{shape_2_name}/output_0", value=constant_shape_value, dtype=TensorProto.INT64, shape=[5]) + mul_2_name = f"{basename}/Mul" + mul_2_inputs = [f"{constant_shape_name}/output_0", "/model/constants/TensorProto.INT64/0D/-1"] + self.make_mul(mul_2_name, mul_2_inputs, dtype=TensorProto.INT64, shape=[5]) + equal_name = f"{basename}/Equal" + equal_inputs = [f"{reshape_2_name}/output_0", f"{mul_2_name}/output_0"] + self.make_equal(equal_name, equal_inputs, shape=[5]) + where_name = f"{basename}/Where" + where_inputs = [f"{equal_name}/output_0", f"{constant_shape_name}/output_0", f"{reshape_2_name}/output_0"] + self.make_where(where_name, where_inputs, dtype=TensorProto.INT64, shape=[5]) + + # Make the final nodes + # + # Where (from above) Concat (from above) + # \ \ + # Unsqueeze --> Expand --> Reshape --> Transpose --> Reshape + unsqueeze_5_name = f"{basename}/Unsqueeze_5" + unsqueeze_5_inputs = [present_kv, "/model/constants/TensorProto.INT64/1D/2"] + self.make_unsqueeze(unsqueeze_5_name, unsqueeze_5_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 1, 'sequence_length', self.head_size]) + expand_name = f"{basename}/Expand" + expand_inputs = [f"{unsqueeze_5_name}/output_0", f"{where_name}/output_0"] + self.make_expand(expand_name, expand_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, self.num_attn_heads // self.num_kv_heads, 'sequence_length', self.head_size]) + reshape_3_name = f"{basename}/Reshape_3" + reshape_3_inputs = [f"{expand_name}/output_0", f"{concat_3_name}/output_0"] + self.make_reshape(reshape_3_name, reshape_3_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_attn_heads, 'sequence_length', self.head_size]) + transpose_2_name = f"{basename}/Transpose_2" + transpose_2_input = f"{reshape_3_name}/output_0" + self.make_transpose(transpose_2_name, transpose_2_input, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads, self.head_size], perm=[0,2,1,3]) + reshape_4_name = f"{basename}/Reshape_4" + reshape_4_inputs = [f"{transpose_2_name}/output_0", f"/model/constants/TensorProto.INT64/1D/0, 0, {self.num_attn_heads * self.head_size}"] + self.make_reshape(reshape_4_name, reshape_4_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads * self.head_size]) + + input_to_attention = 
f"{reshape_4_name}/output_0" + return input_to_attention + def make_attention_op(self, name, **kwargs): op_type = self.attention_attrs["op_type"] @@ -648,13 +836,20 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): # | # O_Add + q_input_to_attention = "" + k_input_to_attention = "" + v_input_to_attention = "" + # Make MatMul nodes q_matmul_name = f"/model/layers.{layer_id}/attn/q_proj/MatMul" self.make_matmul(attention.q_proj.weight.detach().numpy(), q_matmul_name, root_input) + q_input_to_attention = f"{q_matmul_name}/output_0" k_matmul_name = f"/model/layers.{layer_id}/attn/k_proj/MatMul" self.make_matmul(attention.k_proj.weight.detach().numpy(), k_matmul_name, root_input) + k_input_to_attention = f"{k_matmul_name}/output_0" v_matmul_name = f"/model/layers.{layer_id}/attn/v_proj/MatMul" self.make_matmul(attention.v_proj.weight.detach().numpy(), v_matmul_name, root_input) + v_input_to_attention = f"{v_matmul_name}/output_0" # Make Add nodes (if bias exists) q_bias_exists = attention.q_proj.bias is not None @@ -664,27 +859,42 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): if q_bias_exists: q_add_name = f"/model/layers.{layer_id}/attn/q_proj/Add" self.make_add_bias(attention.q_proj.bias.detach().numpy(), q_add_name, root_input=f"{q_matmul_name}/output_0") + q_input_to_attention = f"{q_add_name}/output_0" if k_bias_exists: k_add_name = f"/model/layers.{layer_id}/attn/k_proj/Add" self.make_add_bias(attention.k_proj.bias.detach().numpy(), k_add_name, root_input=f"{k_matmul_name}/output_0") + k_input_to_attention = f"{k_add_name}/output_0" if v_bias_exists: v_add_name = f"/model/layers.{layer_id}/attn/v_proj/Add" self.make_add_bias(attention.v_proj.bias.detach().numpy(), v_add_name, root_input=f"{v_matmul_name}/output_0") + v_input_to_attention = f"{v_add_name}/output_0" # Make RotaryEmbedding nodes q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" q_rotary_input = f"{q_matmul_name if not q_bias_exists else q_add_name}/output_0" self.make_rotary_embedding(attention.rotary_emb, q_rotary_name, q_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) + q_input_to_attention = f"{q_rotary_name}/output_0" + k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" k_rotary_input = f"{k_matmul_name if not k_bias_exists else k_add_name}/output_0" self.make_rotary_embedding(attention.rotary_emb, k_rotary_name, k_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) + k_input_to_attention = f"{k_rotary_name}/output_0" + + # Make repeat KV nodes (TODO: remove once ORT supports GQA for CPU) + past_k = f"past_key_values.{layer_id}.key" + past_v = f"past_key_values.{layer_id}.value" + present_k = f"present.{layer_id}.key" + present_v = f"present.{layer_id}.value" + if self.num_attn_heads != self.num_kv_heads and not self.attention_attrs['use_gqa']: + k_input_to_attention = self.make_repeat_kv(layer_id, k_input_to_attention, past_k, present_k) + v_input_to_attention = self.make_repeat_kv(layer_id, v_input_to_attention, past_v, present_v) + past_k, past_v, present_k, present_v = "", "", "", "" # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) 
attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" self.make_attention_op( - attn_name, q_path=f"{q_rotary_name}/output_0", k_path=f"{k_rotary_name}/output_0", v_path=f"{v_matmul_name if not v_bias_exists else v_add_name}/output_0", - past_k=f"past_key_values.{layer_id}.key", past_v=f"past_key_values.{layer_id}.value", - present_k=f"present.{layer_id}.key", present_v=f"present.{layer_id}.value", **kwargs, + attn_name, q_path=q_input_to_attention, k_path=k_input_to_attention, v_path=v_input_to_attention, + past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, **kwargs, ) # Make MatMul node (output projection weight node) @@ -938,8 +1148,8 @@ def make_attention_mask_reformatting(self): # TODO: replace Concat with Expand for performance gains concat_name = f"{basename}/Concat" - concat_inputs = [f"{end_add_name}/output_0" for _ in range(self.num_kv_heads)] - concat_shape = ["batch_size", self.num_kv_heads, "source_sequence_length", "target_sequence_length"] + concat_inputs = [f"{end_add_name}/output_0" for _ in range(self.num_attn_heads)] + concat_shape = ["batch_size", self.num_attn_heads, "source_sequence_length", "target_sequence_length"] self.make_concat(concat_name, concat_inputs, dtype=self.io_dtype, shape=concat_shape, axis=1) # Shape of mask is now (B, N, S, T) self.mask_attrs["mask_name"] = concat_name @@ -1027,7 +1237,7 @@ def make_input_ids_subgraph(self, basename, past_key_gather_name): # Merged path reshape_name = f"{basename}/Reshape" reshape_inputs = [f"{add_2_name}/output_0", f"{concat_3_name}/output_0"] - self.make_reshape(reshape_name, reshape_inputs) + self.make_reshape(reshape_name, reshape_inputs, dtype=TensorProto.INT64, shape=None) less_name = f"{basename}/Less" less_inputs = [f"{range_name}/output_0", f"{reshape_name}/output_0"] self.make_less(less_name, less_inputs) @@ -1147,7 +1357,7 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for self.make_mul(mul_name, mul_inputs, dtype=TensorProto.INT64, shape=["unk"]) equal_name = f"{basename}/Equal" equal_inputs = [f"{concat_name}/output_0", f"{mul_name}/output_0"] - self.make_equal(equal_name, equal_inputs) + self.make_equal(equal_name, equal_inputs, shape=[4]) where_name = f"{basename}/Where_1" where_inputs = [f"{equal_name}/output_0", f"{constant_shape_name}/output_0", f"{concat_name}/output_0"] @@ -1159,7 +1369,7 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for self.make_expand(expand_name, expand_inputs, dtype=expand_dtype, shape=expand_shape) return expand_name - + class LlamaModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): @@ -1247,7 +1457,7 @@ def make_position_ids_reformatting(self): self.make_concat(concat_name, concat_inputs, dtype=TensorProto.INT64, shape=[2], axis=0) reshape_name = f"{basename}/Reshape" reshape_inputs = ["position_ids", f"{concat_name}/output_0"] - self.make_reshape(reshape_name, reshape_inputs) + self.make_reshape(reshape_name, reshape_inputs, dtype=TensorProto.INT64, shape=None) return reshape_name @@ -1265,54 +1475,6 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) - - def make_group_query_attention(self, name, **kwargs): - if self.layer_id < self.num_layers - 3: - 
super().make_group_query_attention(name, **kwargs) - return - - # Cast inputs and outputs of GroupQueryAttention - input_kwargs = {"q_path", "k_path", "v_path", "past_k", "past_v"} - new_kwargs = {} - - # Make input cast nodes to bfloat16 - for input_name in input_kwargs: - cast_name = f"/model/layers.{self.layer_id}/attn/{input_name.replace('path', 'proj')}/Cast" - cast_shape = ['batch_size', 'sequence_length', self.hidden_size] if input_name in {"q_path", "k_path", "v_path"} else ["batch_size", self.num_kv_heads, "past_sequence_length", self.head_size] - self.make_cast(cast_name, kwargs[input_name], dtype=TensorProto.BFLOAT16, shape=cast_shape) - new_kwargs[input_name] = f"{cast_name}/output_0" - - # Make GroupQueryAttention node - inputs = [ - new_kwargs["q_path"], new_kwargs["k_path"], new_kwargs["v_path"], - new_kwargs["past_k"], new_kwargs["past_v"], - kwargs.get("seqlens_k", ""), kwargs.get("total_seq_len", ""), - ] - outputs = [f"{name}/Cast/output_0", f"{name}/output_1", f"{name}/output_2"] - self.make_node("GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads) - self.make_value_info(outputs[0], TensorProto.BFLOAT16, shape=['batch_size', 'sequence_length', self.hidden_size]) - - present_kv_shape = ["batch_size", self.num_kv_heads, "total_sequence_length", self.head_size] - self.make_value_info(outputs[1], TensorProto.BFLOAT16, shape=present_kv_shape) - self.make_value_info(outputs[2], TensorProto.BFLOAT16, shape=present_kv_shape) - - # Make output cast nodes to float16 - target_dtype = TensorProto.FLOAT16 - - cast_o_path_name = f"{name}/o_proj/Cast" - cast_o_path_output = f"{name}/output_0" - self.make_node("Cast", inputs=[outputs[0]], outputs=[cast_o_path_output], name=cast_o_path_name, to=target_dtype) - self.make_value_info(cast_o_path_output, target_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) - - cast_present_k_name = f"{name}/present_k/Cast" - cast_present_k_output = f"present.{self.layer_id}.key" - self.make_node("Cast", inputs=[outputs[1]], outputs=[cast_present_k_output], name=cast_present_k_name, to=target_dtype) - self.make_value_info(cast_present_k_output, target_dtype, shape=present_kv_shape) - - cast_present_v_name = f"{name}/present_v/Cast" - cast_present_v_output = f"present.{self.layer_id}.value" - self.make_node("Cast", inputs=[outputs[2]], outputs=[cast_present_v_output], name=cast_present_v_name, to=target_dtype) - self.make_value_info(cast_present_v_output, target_dtype, shape=present_kv_shape) def make_mlp(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph From 28de36d28e083db4ef0c97e3207d2b3cc4937250 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 22 Mar 2024 09:24:34 -0700 Subject: [PATCH 18/36] Update README.md (#223) --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 350a2fe67..cc87d474c 100644 --- a/README.md +++ b/README.md @@ -98,10 +98,6 @@ huggingface-cli login --token python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o ``` -## Known issues - -* Mistral and Gemma support on CUDA only - ## Contributing This project welcomes contributions and suggestions. 
Most contributions require you to agree to a From a8a31a1e578ff5762b608bacdf3792f6044a3f34 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 22 Mar 2024 18:26:17 -0400 Subject: [PATCH 19/36] enable nuget publishing (#209) --- .pipelines/nuget-publishing.yml | 13 +++- .../stages/jobs/nuget-linux-packaging-job.yml | 50 ------------ .../stages/jobs/nuget-packaging-job.yml | 77 +++++++++++++++++++ .../stages/jobs/nuget-win-packaging-job.yml | 70 ----------------- .../stages/jobs/py-linux-packaging-job.yml | 8 +- .../stages/jobs/py-win-packaging-job.yml | 30 +------- .../stages/jobs/steps/capi-linux-step.yml | 23 +++++- .../stages/jobs/steps/capi-win-step.yml | 36 ++++++++- .../jobs/steps/compliant/esrp_nuget.yml | 31 ++++++++ .../jobs/steps/nuget-releasing-step.yml | 49 ++++++++++++ .../stages/jobs/steps/nuget-win-step.yml | 16 +++- .../stages/jobs/steps/utils/capi-archive.yml | 6 +- .../stages/jobs/steps/utils/download-ort.yml | 2 +- .../get-nuget-package-version-as-variable.yml | 42 ++++++++++ .pipelines/stages/nuget-packaging-stage.yml | 28 +++++-- 15 files changed, 313 insertions(+), 168 deletions(-) delete mode 100644 .pipelines/stages/jobs/nuget-linux-packaging-job.yml create mode 100644 .pipelines/stages/jobs/nuget-packaging-job.yml delete mode 100644 .pipelines/stages/jobs/nuget-win-packaging-job.yml create mode 100644 .pipelines/stages/jobs/steps/compliant/esrp_nuget.yml create mode 100644 .pipelines/stages/jobs/steps/nuget-releasing-step.yml create mode 100644 .pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index bb639be7c..e91b57489 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -33,6 +33,16 @@ parameters: - '12.2' default: '11.8' +- name: publish_to_ado_feed + displayName: 'Publish to Azure DevOps Feed' + type: boolean + default: false + +- name: publish_to_nuget + displayName: 'Publish to NuGet.org' + type: boolean + default: false + resources: repositories: - repository: manylinux @@ -51,4 +61,5 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} ort_version: ${{ parameters.ort_version }} cuda_version: ${{ parameters.cuda_version }} - + publish_to_nuget: ${{ parameters.publish_to_nuget }} + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml b/.pipelines/stages/jobs/nuget-linux-packaging-job.yml deleted file mode 100644 index fdf9d7106..000000000 --- a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml +++ /dev/null @@ -1,50 +0,0 @@ -parameters: -- name: arch - type: string -- name: ep - type: string -- name: ort_version - type: string -- name: cuda_version - type: string - default: '' - -jobs: -- job: Linux_Nuget_Packaging_${{ parameters.ep }}_${{ parameters.arch }} - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - timeoutInMinutes: 180 - variables: - - name: artifactName - value: 'onnxruntime-genai-capi-linux-${{ parameters.ep }}-${{ parameters.arch }}' - - name: ort_version - value: ${{ parameters.ort_version }} - - name: arch - value: ${{ parameters.arch }} - - name: ep - value: ${{ parameters.ep }} - - name: buildDir - value: 'build/${{ parameters.ep }}' - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-gpu-${{ 
parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - workspace: - clean: all - steps: - - template: steps/capi-linux-step.yml - parameters: - target: 'onnxruntime-genai' - -# TODO: Add a step to build the nuget package - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime Genai capi' - inputs: - ArtifactName: $(artifactName) - - - template: steps/compliant-and-cleanup-step.yml - diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml new file mode 100644 index 000000000..af5250c3c --- /dev/null +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -0,0 +1,77 @@ +parameters: +- name: arch + type: string +- name: ep + type: string +- name: ort_version + type: string +- name: cuda_version + type: string + default: '' +- name: os + type: string +- name: publish_to_ado_feed + type: boolean +- name: publish_to_nuget + type: boolean +jobs: +- job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging + ${{ if eq(parameters.os, 'linux') }}: + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' + ${{ if eq(parameters.os, 'win') }}: + pool: 'onnxruntime-Win-CPU-2022' + timeoutInMinutes: 180 +# set variables here to be used in the template and steps + variables: + - name: arch + value: ${{ parameters.arch }} + - name: artifactName + value: 'onnxruntime-genai-${{ parameters.os }}-${{ parameters.ep }}-${{ parameters.arch }}' + - name: buildConfig + value: 'Release' + - name: buildDir + value: 'build/${{ parameters.ep }}' + - name: cuda_version + value: ${{ parameters.cuda_version }} + - name: ep + value: ${{ parameters.ep }} + - name: ort_version + value: ${{ parameters.ort_version }} + - name: GDN_CODESIGN_TARGETDIRECTORY + value: '$(Build.ArtifactStagingDirectory)/nuget' + - name: ort_filename + ${{ if eq(parameters.ep, 'cpu') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + ${{ else}}: + ${{if eq(parameters.cuda_version, '11.8') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + - name: genai_nuget_ext + ${{ if eq(parameters.ep, 'cpu') }}: + value: '' + ${{ if eq(parameters.ep, 'cuda') }}: + value: '.Cuda' + - name: ort_nuget_ext + ${{ if eq(parameters.ep, 'cpu') }}: + value: '' + ${{ if eq(parameters.ep, 'cuda') }}: + value: '.Gpu' + workspace: + clean: all + steps: + - template: steps/capi-${{ parameters.os }}-step.yml + parameters: + target: 'onnxruntime-genai' + +# TODO: Add a step to build the linux nuget package + - ${{ if eq(parameters.os, 'win') }}: + - template: steps/nuget-${{ parameters.os }}-step.yml + - ${{ if or(eq(parameters.publish_to_nuget, true), eq(parameters.publish_to_ado_feed, true))}}: + - template: steps/nuget-releasing-step.yml + parameters: + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} + + - template: steps/compliant-and-cleanup-step.yml + diff --git a/.pipelines/stages/jobs/nuget-win-packaging-job.yml b/.pipelines/stages/jobs/nuget-win-packaging-job.yml deleted file mode 100644 index b15ceb1ee..000000000 --- a/.pipelines/stages/jobs/nuget-win-packaging-job.yml +++ /dev/null @@ -1,70 +0,0 @@ -parameters: -- name: arch - type: string - values: - - 'x64' - - 'arm64' -- name: ep - type: string - values: - - 
'cpu' - - 'cuda' -- name: ort_version - type: string -- name: cuda_version - type: string - default: '' -jobs: -- job: Windows_Nuget_Packaging_${{ parameters.ep }}_${{ parameters.arch }} - pool: 'onnxruntime-Win-CPU-2022' - timeoutInMinutes: 180 - variables: - - name: buildConfig - value: 'Release' - - name: cuda_version - value: ${{ parameters.cuda_version }} - - name: ort_version - value: ${{ parameters.ort_version }} - - name: arch - value: ${{ parameters.arch }} - - name: ep - value: ${{ parameters.ep }} - - name: buildDir - value: 'build\${{ parameters.ep }}' - - name: artifactName - value : 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}' - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-win-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - - name: genai_nuget_ext - ${{ if eq(parameters.ep, 'cpu') }}: - value: '' - ${{ if eq(parameters.ep, 'cuda') }}: - value: '.Cuda' - - name: ort_nuget_ext - ${{ if eq(parameters.ep, 'cpu') }}: - value: '' - ${{ if eq(parameters.ep, 'cuda') }}: - value: '.Gpu' - workspace: - clean: all - steps: - - template: steps/capi-win-step.yml - parameters: - target: 'onnxruntime-genai' - - - template: steps/nuget-win-step.yml - - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime Genai capi' - inputs: - ArtifactName: $(artifactName) - - - template: steps/compliant-and-cleanup-step.yml - diff --git a/.pipelines/stages/jobs/py-linux-packaging-job.yml b/.pipelines/stages/jobs/py-linux-packaging-job.yml index 7c9e55044..b7c35d6a5 100644 --- a/.pipelines/stages/jobs/py-linux-packaging-job.yml +++ b/.pipelines/stages/jobs/py-linux-packaging-job.yml @@ -31,6 +31,7 @@ jobs: workspace: clean: all pool: 'onnxruntime-Ubuntu2204-AMD-CPU' +# set variables here to be used in the template and steps variables: # The build machine pool doesn't have dotnet, so it can't run CG. 
- name: skipComponentGovernanceDetection @@ -39,6 +40,8 @@ jobs: value: ${{ parameters.arch }} - name: ep value: ${{ parameters.ep }} + - name: artifactName + value: 'onnxruntime-genai-capi-linux-${{ parameters.ep }}-${{ parameters.arch }}-python' - name: cuda_version value: ${{ parameters.cuda_version }} - name: ort_version @@ -57,10 +60,5 @@ jobs: parameters: target: 'python' - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - ArtifactName: onnxruntime-genai-linux-$(ep)-$(arch) - - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-win-packaging-job.yml b/.pipelines/stages/jobs/py-win-packaging-job.yml index 88b285506..0989398eb 100644 --- a/.pipelines/stages/jobs/py-win-packaging-job.yml +++ b/.pipelines/stages/jobs/py-win-packaging-job.yml @@ -24,11 +24,14 @@ jobs: Python312_x64: PythonVersion: '3.12' timeoutInMinutes: 180 +# set variables here to be used in the template and steps variables: - name: ep value: ${{ parameters.ep }} - name: cuda_version value: ${{ parameters.cuda_version }} + - name: artifactName + value: 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}-wheel' - name: arch value: ${{ parameters.arch }} - name: ort_version @@ -64,32 +67,5 @@ jobs: - template: steps/capi-win-step.yml parameters: target: 'python' -# ep: ${{ parameters.ep }} - - - template: steps/compliant/win-esrp-dll-step.yml - parameters: - FolderPath: '$(Build.Repository.LocalPath)\build\$(ep)\wheel\onnxruntime_genai' - DisplayName: 'ESRP - PYD Sign' - DoEsrp: true - Pattern: '*.pyd' - - - powershell: | - cmake --build --preset windows_$(arch)_$(ep)_release --parallel --PyPackageBuild - displayName: 'Build Python Wheel' - - - powershell: | - Get-ChildItem -Path $(Build.Repository.LocalPath) -Recurse - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.Repository.LocalPath)\build\$(ep)\wheel' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime Genai python wheel' - inputs: - ArtifactName: onnxruntime-genai-win-$(ep)-$(arch) - template: steps/compliant-and-cleanup-step.yml \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 897fae5d5..6fa0f3c92 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -66,6 +66,13 @@ steps: - template: utils/capi-archive.yml parameters: archiveType: tar + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + ArtifactName: $(artifactName)-capi + PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + - ${{ if eq(parameters.target, 'python') }}: - bash: | set -e -x @@ -82,6 +89,13 @@ steps: --target python" displayName: 'Build Python $(PyNoDotVer)' workingDirectory: '$(Build.Repository.LocalPath)' + + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '$(Build.Repository.LocalPath)/**/*.pyd' + continueOnError: true + - bash: | set -e -x docker run \ @@ -97,12 +111,19 @@ steps: --target PyPackageBuild" displayName: 'PyPackageBuild $(PyNoDotVer)' workingDirectory: '$(Build.Repository.LocalPath)' + - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: SourceFolder: '$(Build.Repository.LocalPath)/build/$(ep)/wheel' 
Contents: '*manylinux*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' + TargetFolder: '$(Build.ArtifactStagingDirectory)/wheel' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: $(artifactName) + PathtoPublish: '$(Build.ArtifactStagingDirectory)/wheel' - script: | ls $(Build.Repository.LocalPath) -R diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index 1e4d9e2d3..aebc4cd13 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -27,7 +27,6 @@ steps: echo "ep=$(ep)" echo "cuda_version=$(cuda_version)" echo "target=${{ parameters.target }}" - echo "ort_filename=$(ort_filename)" displayName: 'Print Parameters' - template: utils/download-ort.yml @@ -62,6 +61,7 @@ steps: parameters: FolderPath: '$(buildDir)' DisplayName: 'ESRP - Sign C++ dlls' + Pattern: '*genai.dll' - task: BinSkim@4 displayName: 'Run BinSkim' @@ -73,9 +73,43 @@ steps: parameters: archiveType: zip + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + ArtifactName: $(artifactName)-capi + PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + - ${{ if eq(parameters.target, 'python') }}: - task: BinSkim@4 displayName: 'Run BinSkim' inputs: AnalyzeTargetGlob: '$(Build.Repository.LocalPath)\**\*.pyd' continueOnError: true + + - template: compliant/win-esrp-dll-step.yml + parameters: + FolderPath: '$(Build.Repository.LocalPath)\build\$(ep)\wheel\onnxruntime_genai' + DisplayName: 'ESRP - PYD Sign' + DoEsrp: true + Pattern: '*.pyd' + + - powershell: | + cmake --build --preset windows_$(arch)_$(ep)_release --parallel --PyPackageBuild + displayName: 'Build Python Wheel' + + - powershell: | + Get-ChildItem -Path $(Build.Repository.LocalPath) -Recurse + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.Repository.LocalPath)\build\$(ep)\wheel' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)\wheel' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: $(artifactName)-wheel + PathtoPublish: '$(Build.ArtifactStagingDirectory)\wheel' + diff --git a/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml new file mode 100644 index 000000000..081e7a809 --- /dev/null +++ b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml @@ -0,0 +1,31 @@ +parameters: + FolderPath: '' + DisplayName: '' + DoEsrp: 'false' + +steps: +- ${{ if eq(parameters['DoEsrp'], 'true') }}: + - task: SFP.build-tasks.custom-build-task-1.EsrpCodeSigning@2 + displayName: ${{ parameters.DisplayName }} + inputs: + ConnectedServiceName: 'OnnxRuntime CodeSign 20190817' + FolderPath: ${{ parameters.FolderPath }} + Pattern: '*.nupkg' + signConfigType: inlineSignParams + inlineOperation: | + [ + { + "keyCode": "CP-401405", + "operationSetCode": "NuGetSign", + "parameters": [ ], + "toolName": "sign", + "toolVersion": "1.0" + }, + { + "keyCode": "CP-401405", + "operationSetCode": "NuGetVerify", + "parameters": [ ], + "toolName": "sign", + "toolVersion": "1.0" + } + ] \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/nuget-releasing-step.yml b/.pipelines/stages/jobs/steps/nuget-releasing-step.yml new file mode 100644 index 000000000..8442fd069 --- /dev/null +++ 
b/.pipelines/stages/jobs/steps/nuget-releasing-step.yml @@ -0,0 +1,49 @@ +parameters: +- name: publish_to_ado_feed + type: boolean +- name: publish_to_nuget + type: boolean +steps: +- task: NuGetToolInstaller@1 + inputs: + versionSpec: 6.8.x + +- powershell: | + New-Item -Path $(Agent.TempDirectory) -Name "binfiles" -ItemType "directory" + $base_path_name = Join-Path -Path $(Agent.TempDirectory) -ChildPath "binfiles" + Get-ChildItem $(GDN_CODESIGN_TARGETDIRECTORY) -Filter *.nupkg | + Foreach-Object { + $dir_name = Join-Path -Path $base_path_name -ChildPath $_.Basename + $cmd = "7z.exe x $($_.FullName) -y -o$dir_name" + Write-Output $cmd + Invoke-Expression -Command $cmd + } + dir $(Agent.TempDirectory) + tree $(Agent.TempDirectory) + workingDirectory: '$(Agent.TempDirectory)' + +- task: CodeSign@1 + displayName: 'Run Codesign Validation' + +- task: PublishSecurityAnalysisLogs@3 + displayName: 'Publish Security Analysis Logs' + continueOnError: true + +- task: PostAnalysis@2 + inputs: + GdnBreakAllTools: true + GdnBreakPolicy: M365 + GdnBreakPolicyMinSev: Error + +- template: utils/get-nuget-package-version-as-variable.yml + parameters: + packageFolder: '$(GDN_CODESIGN_TARGETDIRECTORY)' +#This task must be run on a Windows machine +- ${{ if eq(parameters.publish_to_ado_feed, true) }}: + - task: NuGetCommand@2 + displayName: 'NuGet push to Azure DevOps Feed' + inputs: + command: push + packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' + publishVstsFeed: 'PublicPackages/onnxruntime-genai' + allowPackageConflicts: true \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index e54c63336..b3c5e4fa8 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -14,7 +14,7 @@ steps: parameters: FolderPath: '$(Build.Repository.LocalPath)\src\csharp\bin\Release\' DisplayName: 'ESRP - Sign C# dlls' - + Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | $VERSION = '0.1.0-rc1' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` @@ -41,4 +41,16 @@ steps: inputs: SourceFolder: '$(Build.Repository.LocalPath)\nuget' Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file + TargetFolder: '$(Build.ArtifactStagingDirectory)\nuget' + +- template: compliant/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)\nuget' + DoEsrp: 'true' + +- task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + PathtoPublish: '$(Build.ArtifactStagingDirectory)\nuget' + ArtifactName: $(artifactName)-nuget' \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/utils/capi-archive.yml b/.pipelines/stages/jobs/steps/utils/capi-archive.yml index 6f198fe97..1395b31f7 100644 --- a/.pipelines/stages/jobs/steps/utils/capi-archive.yml +++ b/.pipelines/stages/jobs/steps/utils/capi-archive.yml @@ -5,7 +5,7 @@ steps: - bash: | echo "##[error]Error: artifactName and buildDir are not set" exit 1 - displayName: 'Check if variables ort_filename and ort_filename are set' + displayName: 'Check if variables artifactName and buildDir are set' condition: or( eq (variables['artifactName'], ''), eq (variables['buildDir'], '')) - task: CopyFiles@2 @@ -60,9 +60,9 @@ steps: archiveType: ${{ parameters.archiveType }} ${{ if eq(parameters.archiveType, 'tar') }}: tarCompression: 'gz' - archiveFile: 
'$(Build.ArtifactStagingDirectory)/$(artifactName).tgz' + archiveFile: '$(Build.ArtifactStagingDirectory)/capi/$(artifactName).tgz' ${{ else }}: - archiveFile: '$(Build.ArtifactStagingDirectory)/$(artifactName).zip' + archiveFile: '$(Build.ArtifactStagingDirectory)/capi/$(artifactName).zip' replaceExistingArchive: true - task: DeleteFiles@1 diff --git a/.pipelines/stages/jobs/steps/utils/download-ort.yml b/.pipelines/stages/jobs/steps/utils/download-ort.yml index 25bea32c6..366e3009e 100644 --- a/.pipelines/stages/jobs/steps/utils/download-ort.yml +++ b/.pipelines/stages/jobs/steps/utils/download-ort.yml @@ -5,7 +5,7 @@ steps: - bash: | echo "##[error]Error: ort_version and ort_filename are not set" exit 1 - displayName: 'Check if variables ort_filename and ort_filename are set' + displayName: 'Check if variables ort_version and ort_filename are set' condition: or( eq (variables['ort_version'], ''), eq (variables['ort_filename'], '')) - task: DownloadGitHubRelease@0 diff --git a/.pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml b/.pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml new file mode 100644 index 000000000..4edf0d03a --- /dev/null +++ b/.pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml @@ -0,0 +1,42 @@ +parameters: + packageFolder: $(Build.ArtifactStagingDirectory) + +steps: +- task: CmdLine@2 + condition: eq(variables['Agent.OS'], 'Windows_NT') + displayName: 'Extract version number from the NuPkg file, Windows VMs' + inputs: + workingDirectory: '${{ parameters.packageFolder }}' + script: | + SETLOCAL EnableDelayedExpansion + FOR /R %%i IN (Microsoft.ML.OnnxRuntime.Managed*.nupkg) do ( + set filename=%%~ni + set ortversion=!filename:~33! + @echo ortversion is !ortversion! + @echo ##vso[task.setvariable variable=NuGetPackageVersionNumber;]!ortversion! + ) +- task: CmdLine@2 + condition: eq(variables['Agent.OS'], 'Windows_NT') + displayName: 'Extract version number from the DirectML NuPkg file, Windows VMs' + inputs: + workingDirectory: '${{ parameters.packageFolder }}' + script: | + SETLOCAL EnableDelayedExpansion + FOR /R %%i IN (Microsoft.ML.OnnxRuntime.DirectML*.nupkg) do ( + set filename=%%~ni + set ortversion=!filename:~34! + @echo DirectMLNuGetPackageVersionNumber is !ortversion! + @echo ##vso[task.setvariable variable=DirectMLNuGetPackageVersionNumber;]!ortversion! + ) +- task: CmdLine@2 + condition: not(eq(variables['Agent.OS'], 'Windows_NT')) + displayName: 'Extract version number from the NuPkg file, Unix VMs' + inputs: + workingDirectory: '${{ parameters.packageFolder }}' + script: | + filenamewithext=$(ls Microsoft.ML.OnnxRuntime.Managed*nupkg) + filename=${filenamewithext%.*} + ortversion=${filename:33} + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
+ set +x + echo "##vso[task.setvariable variable=NuGetPackageVersionNumber;]$ortversion" diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index e1125cf96..db500916b 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -12,35 +12,49 @@ parameters: - name: cuda_version type: string default: '' - +- name: publish_to_ado_feed + type: boolean +- name: publish_to_nuget + type: boolean stages: - stage: nuget_packaging jobs: - ${{ if eq(parameters.enable_win_cpu, true) }}: - - template: jobs/nuget-win-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_win_cuda, true) }}: - - template: jobs/nuget-win-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' ort_version: ${{ parameters.ort_version }} - + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cpu, true) }}: - - template: jobs/nuget-linux-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cuda, true) }}: - - template: jobs/nuget-linux-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} \ No newline at end of file + ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} \ No newline at end of file From b3e62ad68eb682fc685d473d4d1e5f5a33f1f308 Mon Sep 17 00:00:00 2001 From: aciddelgado <139922440+aciddelgado@users.noreply.github.com> Date: Fri, 22 Mar 2024 16:35:54 -0700 Subject: [PATCH 20/36] fix benchmark wall clock (#216) Corrects a small error with the benchmarking script implemented in [the original benchmark script PR ](https://github.com/microsoft/onnxruntime-genai/pull/114). All tests done with same parameters. 
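Editorial note, not part of the original pull-request description: the essence of the fix is where the wall-clock timer starts and what the throughput is divided by. Below is a minimal, illustrative reduction of the corrected ordering — it is not the benchmark script itself, it assumes `model` and `tokenizer` are already-constructed `onnxruntime_genai` objects, and the helper name `timed_generation` and its `prompt` parameter are invented for the sketch.

```python
import time
import onnxruntime_genai as og

def timed_generation(model, tokenizer, prompt, max_length):
    # Start the wall clock BEFORE tokenization and generator setup, so the whole
    # end-to-end pipeline falls inside the measured window (this is what the diff
    # below moves to the top of each benchmark repetition).
    wall_clock_start = time.time()

    tokens = tokenizer.encode(prompt)

    params = og.GeneratorParams(model)
    params.input_ids = tokens
    # min_length == max_length forces a fixed number of generated tokens.
    params.set_search_options({"max_length": max_length, "min_length": max_length})
    generator = og.Generator(model, params)

    # Prompt processing + token generation.
    while not generator.is_done():
        generator.compute_logits()
        generator.generate_next_token()

    wall_clock_time = time.time() - wall_clock_start
    # Throughput divides the full sequence length by the full wall-clock window,
    # mirroring the `max_length / avg_wall_clock_time` change in the diff
    # (the real script additionally scales by batch size).
    return max_length / wall_clock_time
```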
## Performance Metrics

## Fp32-cpu

- **Average Tokenization Latency (per token):** 0.4079251408984419 ms
- **Average Tokenization Throughput (per token):** 2451.4301761287193 tps
- **Average Prompt Processing Latency (per token):** 9.474278237737508 ms
- **Average Prompt Processing Throughput (per token):** 105.5489373340173 tps
- **Average Token Generation Latency (per token):** 186.07972450856732 ms
- **Average Token Generation Throughput (per token):** 5.3740406303856005 tps
- **Average Sampling Latency (per token):** 0.1549795838015365 ms
- **Average Sampling Throughput (per token):** 6452.462804910989 tps
- **Average Wall Clock Time:** 49.07776738643646 s
- **Average Wall Clock Throughput:** 7.8243168026858 tps

## Int4-cpu

- **Average Tokenization Latency (per token):** 0.12926810202770866 ms
- **Average Tokenization Throughput (per token):** 7735.860466069577 tps
- **Average Prompt Processing Latency (per token):** 10.303267585383082 ms
- **Average Prompt Processing Throughput (per token):** 97.05658828260155 tps
- **Average Token Generation Latency (per token):** 84.14963581704069 ms
- **Average Token Generation Throughput (per token):** 11.88359272491938 tps
- **Average Sampling Latency (per token):** 0.1820565843723898 ms
- **Average Sampling Throughput (per token):** 5492.797766405076 tps
- **Average Wall Clock Time:** 22.826100442409516 s
- **Average Wall Clock Throughput:** 16.822847203745376 tps

## Fp16-cuda

- **Average Tokenization Latency (per token):** 0.2424314897507429 ms
- **Average Tokenization Throughput (per token):** 4124.876685896518 tps
- **Average Prompt Processing Latency (per token):** 0.07853922086042076 ms
- **Average Prompt Processing Throughput (per token):** 12732.491983555472 tps
- **Average Token Generation Latency (per token):** 8.747893429670668 ms
- **Average Token Generation Throughput (per token):** 114.3132352993979 tps
- **Average Sampling Latency (per token):** 0.02670477955492011 ms
- **Average Sampling Throughput (per token):** 37446.480243112856 tps
- **Average Wall Clock Time:** 2.2886383962631225 s
- **Average Wall Clock Throughput:** 167.785352472891 tps

## Int4-cuda

- **Average Tokenization Latency (per token):** 0.11371983797289431 ms
- **Average Tokenization Throughput (per token):** 8793.540492366468 tps
- **Average Prompt Processing Latency (per token):** 0.10714008702052524 ms
- **Average Prompt Processing Throughput (per token):** 9333.574648006643 tps
- **Average Token Generation Latency (per token):** 4.4145354868173055 ms
- **Average Token Generation Throughput (per token):** 226.52439944954617 tps
- **Average Sampling Latency (per token):** 0.024083436303499184 ms
- **Average Sampling Throughput (per token):** 41522.31381759694 tps
- **Average Wall Clock Time:** 1.1688858103752136 s
- **Average Wall Clock Throughput:** 328.51797548704576 tps

--- benchmark/python/benchmark_e2e.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmark/python/benchmark_e2e.py index f5cfd3143..44e8194eb 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -85,6 +85,8 @@ def main(args): wall_clock_times = [] if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}") for _ in tqdm(range(num_repetitions)): + wall_clock_start_time = time.time() + # Prepare run generator = og.Generator(model, params) @@ -94,6 +96,12 @@ def main(args): tokenize_end_time = time.perf_counter()
tokenize_times.append(tokenize_end_time - tokenize_start_time) + # Prepare run + params = og.GeneratorParams(model) + params.input_ids = tokens + params.set_search_options({"max_length":max_length, "min_length":max_length}) + generator = og.Generator(model, params) + # Measure prompt processing prompt_start_time = time.perf_counter() generator.compute_logits() @@ -106,7 +114,6 @@ def main(args): sampling_times.append(sampling_end_time - sampling_start_time) # Measure token generation - wall_clock_start_time = time.time() while not generator.is_done(): # Run inference token_gen_start_time = time.perf_counter() @@ -154,7 +161,7 @@ def main(args): # Calculate wall clock time avg_wall_clock_time = sum(wall_clock_times) / len(wall_clock_times) - avg_wall_clock_thrpt = batch_size * (generation_length / avg_wall_clock_time) + avg_wall_clock_thrpt = batch_size * (max_length / avg_wall_clock_time) print(f"Average Wall Clock Time: {avg_wall_clock_time} s") print(f"Average Wall Clock Throughput: {avg_wall_clock_thrpt} tps") From 879e6194cce1361ed88ce37a5b75c2c8b4146856 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Fri, 22 Mar 2024 20:15:42 -0700 Subject: [PATCH 21/36] Remove GenerateNextToken* special case functions (#221) set_search_options already supports the functionality, so the extra functions are confusing users since there are multiple ways to do the same thing. set_search_options is also more flexible as it supports all future options without the need for extra APIs. --- src/config.h | 2 +- src/csharp/Generator.cs | 20 --------- src/csharp/NativeMethods.cs | 23 ---------- src/generators.cpp | 37 ++++++---------- src/generators.h | 4 -- src/ort_genai_c.cpp | 28 ------------ src/ort_genai_c.h | 17 -------- src/python/python.cpp | 20 --------- test/c_api_tests.cpp | 40 +++-------------- test/csharp/TestOnnxRuntimeGenAIAPI.cs | 41 +++--------------- test/model_tests.cpp | 6 +-- test/sampling_tests.cpp | 60 ++++++++++++++++++-------- 12 files changed, 72 insertions(+), 226 deletions(-) diff --git a/src/config.h b/src/config.h index 6cc634658..2621edc21 100644 --- a/src/config.h +++ b/src/config.h @@ -79,7 +79,7 @@ struct Config { int num_return_sequences{1}; float repetition_penalty{1.0f}; // 1.0 means no penalty. int top_k{}; // Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in the generate method of the model. - float top_p{1.0f}; // If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + float top_p{}; // If set to float >0 and <1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. float temperature{1.0f}; bool early_stopping{true}; // Whether to stop the beam search when at least num_beams sentences are finished per batch or not. 
int no_repeat_ngram_size{}; diff --git a/src/csharp/Generator.cs b/src/csharp/Generator.cs index 10c3d4e47..64c1c5623 100644 --- a/src/csharp/Generator.cs +++ b/src/csharp/Generator.cs @@ -30,26 +30,6 @@ public void GenerateNextToken() Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken(_generatorHandle)); } - public void GenerateNextTokenTop() - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_Top(_generatorHandle)); - } - - public void GenerateNextTokenTopK(int k, float temperature) - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK(_generatorHandle, k, temperature)); - } - - public void GenerateNextTokenTopP(float p, float temperature) - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopP(_generatorHandle, p, temperature)); - } - - public void GenerateNextTokenTopKTopP(int k, float p, float temperature) - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK_TopP(_generatorHandle, k, p, temperature)); - } - public ReadOnlySpan GetSequence(ulong index) { ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceLength(_generatorHandle, (UIntPtr)index).ToUInt64(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 039dfb4de..552c9046a 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -80,29 +80,6 @@ internal class NativeLib [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken(IntPtr /* OgaGenerator* */ generator); - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_Top(IntPtr /* OgaGenerator* */ generator); - - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK(IntPtr /* OgaGenerator* */ generator, - int /* int32_t */ k, - float /* single_t */ t); - - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopP(IntPtr /* OgaGenerator* */ generator, - float /* single_t */ p, - float /* single_t */ t); - - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK_TopP(IntPtr /* OgaGenerator* */ generator, - int /* int32_t */ k, - float /* single_t */ p, - float /* single_t */ t); - // This function returns the length of the sequence at the given index. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceLength(IntPtr /* const OgaGenerator* */ generator, diff --git a/src/generators.cpp b/src/generators.cpp index 6844a9aaf..cee8b1b02 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -84,7 +84,7 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ void Generator::ComputeLogits() { if (computed_logits_) - throw std::runtime_error("ComputeLogits called again without calling GenerateNextToken* first"); + throw std::runtime_error("ComputeLogits called again without calling GenerateNextToken first"); search_->SetLogits(state_->Run(search_->GetSequenceLength(), search_->GetNextTokens(), search_->GetNextIndices())); computed_logits_ = true; @@ -101,46 +101,37 @@ bool Generator::IsDone() const { return search_->IsDone(); } -void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temperature) { +void Generator::GenerateNextToken() { if (!computed_logits_) - throw std::runtime_error("Must call ComputeLogits before GenerateNextToken*"); + throw std::runtime_error("Must call ComputeLogits before GenerateNextToken"); computed_logits_ = false; - if (top_k == 1) { + auto& search = search_->params_->search; + if (!search.do_sample || search.top_k == 1) { search_->SelectTop(); return; } // The user explicitly called TopK_TopP on a beam search - if (search_->params_->search.num_beams != 1) + if (search.num_beams != 1) throw std::runtime_error("TopK and TopP cannot be used with a beam search"); // Sanity checks - if (top_p < 0.0f || top_p > 1.0f) + if (search.top_p < 0.0f || search.top_p > 1.0f) throw std::runtime_error("top_p must be between 0.0 and 1.0"); - if (top_k < 0) + if (search.top_k < 0) throw std::runtime_error("top_k must be 0 or greater"); - if (top_p > 0.0f && top_p < 1.0f && top_k > 1) { - search_->SampleTopKTopP(top_k, top_p, temperature); - } else if (top_k > 1) { - search_->SampleTopK(top_k, temperature); + if (search.top_p > 0.0f && search.top_p < 1.0f && search.top_k > 1) { + search_->SampleTopKTopP(search.top_k, search.top_p, search.temperature); + } else if (search.top_k > 1) { + search_->SampleTopK(search.top_k, search.temperature); } else { - assert(top_k == 0); - if (top_p == 0.0f) - throw std::runtime_error("top_k and top_p cannot both be zero"); - search_->SampleTopP(top_p, temperature); + assert(search.top_k == 0); + search_->SampleTopP(search.top_p, search.temperature); } } -void Generator::GenerateNextToken() { - auto& search = search_->params_->search; - if (search.do_sample) - GenerateNextToken_TopK_TopP(search.top_k, search.top_p, search.temperature); - else - GenerateNextToken_Top(); -} - RoamingArray Generator::GetSequence(int index) const { return search_->GetSequence(index); } diff --git a/src/generators.h b/src/generators.h index 1b42b45e9..3fb9f5201 100644 --- a/src/generators.h +++ b/src/generators.h @@ -100,10 +100,6 @@ struct Generator { bool IsDone() const; void ComputeLogits(); - void GenerateNextToken_TopK_TopP(int top_k, float top_p, float temperature); - void GenerateNextToken_TopP(float p, float temperature) { GenerateNextToken_TopK_TopP(0, p, temperature); } - void GenerateNextToken_TopK(int k, float temperature) { GenerateNextToken_TopK_TopP(k, 0.0f, temperature); } - void GenerateNextToken_Top() { GenerateNextToken_TopK_TopP(1, 0.0f, 0.0f); } void GenerateNextToken(); RoamingArray GetSequence(int index) const; diff --git a/src/ort_genai_c.cpp 
b/src/ort_genai_c.cpp index bbf84be51..1beb2a43b 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -156,34 +156,6 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_Top(); - return nullptr; - OGA_CATCH -} - -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_TopK(k, t); - return nullptr; - OGA_CATCH -} - -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_TopP(p, t); - return nullptr; - OGA_CATCH -} - -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_TopK_TopP(k, p, t); - return nullptr; - OGA_CATCH -} - size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().size(); diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index e702082fc..fbd394f10 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -172,23 +172,6 @@ OGA_EXPORT bool OGA_API_CALL OgaGenerator_IsDone(const OgaGenerator* generator); * \return OgaResult containing the error message if the computation of the logits failed. */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_ComputeLogits(OgaGenerator* generator); - -/* - * \brief Generates the next token based on the computed logits using the greedy search. - * \param[in] generator The generator to generate the next token for. - * \return OgaResult containing the error message if the generation of the next token failed. 
- */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator); - -/* Top-K sampling: most probable words from the model's output probability distribution for the next word - */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t); - -/*Top-P sampling selects words from the smallest set of words whose cumulative probability exceeds a predefined threshold (p) - */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t); - -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t); OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator); /* diff --git a/src/python/python.cpp b/src/python/python.cpp index 1c8db803d..584beb97c 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -137,22 +137,6 @@ struct PyGenerator { generator_->ComputeLogits(); } - void GenerateNextToken_TopK_TopP(int top_k, float top_p, float temperature) { - generator_->GenerateNextToken_TopK_TopP(top_k, top_p, temperature); - } - - void GenerateNextToken_TopP(float p, float temperature) { - generator_->GenerateNextToken_TopP(p, temperature); - } - - void GenerateNextToken_TopK(int k, float temperature) { - generator_->GenerateNextToken_TopK(k, temperature); - } - - void GenerateNextToken_Top() { - generator_->GenerateNextToken_Top(); - } - void GenerateNextToken() { generator_->GenerateNextToken(); } @@ -235,10 +219,6 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def("is_done", &PyGenerator::IsDone) .def("compute_logits", &PyGenerator::ComputeLogits) .def("generate_next_token", &PyGenerator::GenerateNextToken) - .def("generate_next_token_top", &PyGenerator::GenerateNextToken_Top) - .def("generate_next_token_top_p", &PyGenerator::GenerateNextToken_TopP) - .def("generate_next_token_top_k", &PyGenerator::GenerateNextToken_TopK) - .def("generate_next_token_top_k_top_p", &PyGenerator::GenerateNextToken_TopK_TopP) .def("get_next_tokens", &PyGenerator::GetNextTokens) .def("get_sequence", &PyGenerator::GetSequence); diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 2ac6bfb71..3a04a8180 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -187,6 +187,7 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", max_length)); + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", false)); CheckResult(OgaGeneratorParamsSetInputIDs(params, input_ids.data(), input_ids.size(), sequence_length, batch_size)); OgaGenerator* generator; @@ -195,7 +196,7 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { while (!OgaGenerator_IsDone(generator)) { CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_Top(generator)); + CheckResult(OgaGenerator_GenerateNextToken(generator)); } // Verify outputs match expected outputs @@ -252,20 +253,11 @@ TEST(CAPITests, TopKCAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; - - while 
(!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_TopK(generator, top_k, temp)); - } - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; @@ -310,20 +302,10 @@ TEST(CAPITests, TopPCAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; - - while (!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_TopP(generator, top_p, temp)); - } - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; @@ -369,21 +351,11 @@ TEST(CAPITests, TopKTopPCAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; - - while (!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_TopK_TopP(generator, top_k, top_p, temp)); - } - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 7bca5ffdc..156f943b4 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -64,7 +64,7 @@ public void TestGreedySearch() while (!generator.IsDone()) { generator.ComputeLogits(); - generator.GenerateNextTokenTop(); + generator.GenerateNextToken(); } for (ulong i = 0; i < batchSize; i++) @@ -92,7 +92,7 @@ public void TestTopKSearch() { int topK = 100; float temp = 0.6f; - ulong maxLength = 40; + ulong maxLength = 20; string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) @@ -115,17 +115,8 @@ public 
void TestTopKSearch() using GeneratorParams generatorParams = new GeneratorParams(model); Assert.NotNull(generatorParams); - generatorParams.SetSearchOption("max_length", 20); generatorParams.SetInputSequences(sequences); - - using Generator generator = new Generator(model, generatorParams); - Assert.NotNull(generator); - while (!generator.IsDone()) - { - generator.ComputeLogits(); - generator.GenerateNextTokenTopK(topK, temp); - } - + generatorParams.SetSearchOption("max_length", maxLength); generatorParams.SetSearchOption("do_sample", true); generatorParams.SetSearchOption("top_k", topK); generatorParams.SetSearchOption("temperature", temp); @@ -143,7 +134,7 @@ public void TestTopPSearch() { float topP = 0.6f; float temp = 0.6f; - ulong maxLength = 40; + ulong maxLength = 20; string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) @@ -166,17 +157,8 @@ public void TestTopPSearch() using GeneratorParams generatorParams = new GeneratorParams(model); Assert.NotNull(generatorParams); - generatorParams.SetSearchOption("max_length", 20); generatorParams.SetInputSequences(sequences); - - using Generator generator = new Generator(model, generatorParams); - Assert.NotNull(generator); - while (!generator.IsDone()) - { - generator.ComputeLogits(); - generator.GenerateNextTokenTopP(topP, temp); - } - + generatorParams.SetSearchOption("max_length", maxLength); generatorParams.SetSearchOption("do_sample", true); generatorParams.SetSearchOption("top_p", topP); generatorParams.SetSearchOption("temperature", temp); @@ -195,7 +177,7 @@ public void TestTopKTopPSearch() int topK = 100; float topP = 0.6f; float temp = 0.6f; - ulong maxLength = 40; + ulong maxLength = 20; string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) @@ -218,17 +200,8 @@ public void TestTopKTopPSearch() using GeneratorParams generatorParams = new GeneratorParams(model); Assert.NotNull(generatorParams); - generatorParams.SetSearchOption("max_length", 20); generatorParams.SetInputSequences(sequences); - - using Generator generator = new Generator(model, generatorParams); - Assert.NotNull(generator); - while (!generator.IsDone()) - { - generator.ComputeLogits(); - generator.GenerateNextTokenTopKTopP(topK, topP, temp); - } - + generatorParams.SetSearchOption("max_length", maxLength); generatorParams.SetSearchOption("do_sample", true); generatorParams.SetSearchOption("top_k", topK); generatorParams.SetSearchOption("top_p", topP); diff --git a/test/model_tests.cpp b/test/model_tests.cpp index a2b3a7832..79c1d2c64 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -44,7 +44,7 @@ TEST(ModelTests, GreedySearchGptFp32) { while (!generator->IsDone()) { generator->ComputeLogits(); - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); } // Verify outputs match expected outputs @@ -128,7 +128,7 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, const char* model_label) while (!generator->IsDone()) { generator->ComputeLogits(); - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); } // Verify outputs match expected outputs @@ -226,7 +226,7 @@ Print all primes between 1 and n auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); } auto result = generator->GetSequence(0); diff --git 
a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 531270f78..239c71ab6 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -27,6 +27,8 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample=true; + params->search.top_p=0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -37,7 +39,7 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { generator->search_->SetLogits(logits_span); generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->GenerateNextToken_TopP(0.25f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -53,6 +55,8 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -64,8 +68,7 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { generator->computed_logits_ = true; // Verify outputs match expected outputs - int k = 2; - generator->GenerateNextToken_TopK(k, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -85,6 +88,9 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; + params->search.top_p = 0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -95,9 +101,7 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; // Verify outputs match expected outputs - float p = 0.25f; - int k = 2; - generator->GenerateNextToken_TopK_TopP(k, p, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -130,6 +134,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_p = 0.95f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -147,7 +153,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopP(0.95f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -166,6 +172,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; params->batch_size = batch_size; params->sequence_length = 1; 
params->vocab_size = vocab_size; @@ -183,7 +191,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { auto logits_copy=logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK(k, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -203,6 +211,9 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; + params->search.top_p = p; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -220,7 +231,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -248,6 +259,8 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_p = 0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -259,7 +272,7 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->GenerateNextToken_TopP(0.25f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -276,6 +289,8 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -287,8 +302,7 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs - int k = 2; - generator->GenerateNextToken_TopK(k, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -309,6 +323,9 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; + params->search.top_p = 0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -320,9 +337,7 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs - float p = 0.25f; - int k = 2; - 
generator->GenerateNextToken_TopK_TopP(k, p, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -338,6 +353,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_p = 0.95f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -358,7 +375,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopP(0.95f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs @@ -378,6 +395,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -398,7 +417,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK(k, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs @@ -419,6 +438,9 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; + params->search.top_p = p; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -439,7 +461,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs @@ -478,7 +500,7 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs From a76a9ac46f91a1aeb7ecad8958dde3f0ac0f3d81 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 25 Mar 2024 
16:26:53 -0400 Subject: [PATCH 22/36] Fix a typo (#232) --- .pipelines/stages/jobs/steps/nuget-win-step.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index b3c5e4fa8..3191412a3 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -53,4 +53,4 @@ steps: displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: PathtoPublish: '$(Build.ArtifactStagingDirectory)\nuget' - ArtifactName: $(artifactName)-nuget' \ No newline at end of file + ArtifactName: $(artifactName)-nuget \ No newline at end of file From 4022ba6b48827768ab7b06a8d5e84c6921f493d1 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:47:48 -0700 Subject: [PATCH 23/36] Easy to use C++ API Wrapper (#225) Wraps the C API in a 0-overhead C++ style API that automatically manages resources and gives C++ style interfaces. It removes all of the C "glue" and should be much simpler and safer to use. I switched the C API example to use it, plus our C API unit tests. Before: ``` OgaModel* model; CheckResult(OgaCreateModel("phi-2", OgaDeviceTypeCPU, &model)); OgaModelPtr model_ptr{model}; OgaTokenizer* tokenizer; CheckResult(OgaCreateTokenizer(model, &tokenizer)); OgaTokenizerPtr tokenizer_ptr{tokenizer}; const char* prompt = "def is_prime(num):"; std::cout << "Prompt: " << std::endl << prompt << std::endl; OgaSequences* sequences; CheckResult(OgaCreateSequences(&sequences)); OgaSequencesPtr sequences_ptr{sequences}; CheckResult(OgaTokenizerEncode(tokenizer, prompt, sequences)); OgaGeneratorParams* params; CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 200)); CheckResult(OgaGeneratorParamsSetInputSequences(params, sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; size_t sequence_length = OgaSequencesGetSequenceCount(output_sequences, 0); const int32_t* sequence = OgaSequencesGetSequenceData(output_sequences, 0); const char* out_string; CheckResult(OgaTokenizerDecode(tokenizer, sequence, sequence_length, &out_string)); std::cout << "Output: " << std::endl << out_string << std::endl; ``` After: ``` auto model = OgaModel::Create("phi-2"); auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; std::cout << "Prompt: " << std::endl << prompt << std::endl; auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt, *sequences); auto params = OgaGeneratorParams::Create(*model); params->SetSearchOption("max_length", 200); params->SetInputSequences(*sequences); auto output_sequences = model->Generate(*params); auto out_string = tokenizer->Decode(output_sequences->Get(0)); std::cout << "Output: " << std::endl << out_string << std::endl; ``` --- examples/c/README.md | 3 +- examples/c/src/main.cpp | 97 ++++++----- src/ort_genai.h | 189 ++++++++++++++++++++++ src/ort_genai_c.cpp | 29 ++-- src/ort_genai_c.h | 2 +- test/c_api_tests.cpp | 348 ++++++++++++---------------------------- 6 files changed, 361 insertions(+), 307 deletions(-) create mode 100644 src/ort_genai.h diff --git a/examples/c/README.md b/examples/c/README.md index 6bd58b38d..8cd2168fd 100644 --- a/examples/c/README.md +++ b/examples/c/README.md @@ -33,10 +33,9 @@ python -m 
onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o - onnxruntime.dll - onnxruntime_providers_shared.dll - onnxruntime_providers_cuda.dll - - onnxruntime.lib - onnxruntime-genai.dll - onnxruntime-genai.lib -2. Copy over the `ort_genai_c.h` header file to the [include](include) directory. +2. Copy over the `ort_genai.h` and `ort_genai_c.h` header files to the [include](include) directory. On Windows: ```bash diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index 60ee8b837..d9aeb68a8 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -1,72 +1,61 @@ #include -#include "ort_genai_c.h" +#include +#include "ort_genai.h" -struct Deleters { - void operator()(OgaResult* p) { - OgaDestroyResult(p); - } - void operator()(OgaSequences* p) { - OgaDestroySequences(p); - } - void operator()(OgaModel* p) { - OgaDestroyModel(p); - } - void operator()(OgaGeneratorParams* p) { - OgaDestroyGeneratorParams(p); - } - void operator()(OgaGenerator* p) { - OgaDestroyGenerator(p); - } - void operator()(OgaTokenizer* p) { - OgaDestroyTokenizer(p); - } -}; +// C++ API Example -using OgaResultPtr = std::unique_ptr; -using OgaSequencesPtr = std::unique_ptr; -using OgaModelPtr = std::unique_ptr; -using OgaGeneratorParamsPtr = std::unique_ptr; -using OgaGeneratorPtr = std::unique_ptr; -using OgaTokenizerPtr = std::unique_ptr; +void CXX_API() { + auto model = OgaModel::Create("phi-2"); + auto tokenizer = OgaTokenizer::Create(*model); -void CheckResult(OgaResult* result) { - if (!result) - return; + const char* prompt = "def is_prime(num):"; + std::cout << "Prompt: " << std::endl << prompt << std::endl; + + auto sequences = OgaSequences::Create(); + tokenizer->Encode(prompt, *sequences); + + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 200); + params->SetInputSequences(*sequences); + + auto output_sequences = model->Generate(*params); + auto out_string = tokenizer->Decode(output_sequences->Get(0)); - OgaResultPtr result_ptr{result}; - throw std::runtime_error(OgaResultGetError(result)); + std::cout << "Output: " << std::endl << out_string << std::endl; } -int main() { - std::cout << "-------------" << std::endl; - std::cout << "Hello, Phi-2!" 
<< std::endl; - std::cout << "-------------" << std::endl; +// C API Example + +void CheckResult(OgaResult* result) { + if (result) { + std::string string=OgaResultGetError(result); + OgaDestroyResult(result); + throw std::runtime_error(string); + } +} +void C_API() { OgaModel* model; - CheckResult(OgaCreateModel("phi-2", OgaDeviceTypeCPU, &model)); - OgaModelPtr model_ptr{model}; + OgaCreateModel("phi-2", &model); OgaTokenizer* tokenizer; CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; const char* prompt = "def is_prime(num):"; - std::cout << "Prompt: " << std::endl << prompt << std::endl; + std::cout << "Prompt: " << std::endl + << prompt << std::endl; OgaSequences* sequences; CheckResult(OgaCreateSequences(&sequences)); - OgaSequencesPtr sequences_ptr{sequences}; CheckResult(OgaTokenizerEncode(tokenizer, prompt, sequences)); OgaGeneratorParams* params; CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 200)); CheckResult(OgaGeneratorParamsSetInputSequences(params, sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; size_t sequence_length = OgaSequencesGetSequenceCount(output_sequences, 0); const int32_t* sequence = OgaSequencesGetSequenceData(output_sequences, 0); @@ -74,7 +63,27 @@ int main() { const char* out_string; CheckResult(OgaTokenizerDecode(tokenizer, sequence, sequence_length, &out_string)); - std::cout << "Output: " << std::endl << out_string << std::endl; + std::cout << "Output: " << std::endl + << out_string << std::endl; + + OgaDestroyString(out_string); + OgaDestroySequences(output_sequences); + OgaDestroyGeneratorParams(params); + OgaDestroySequences(sequences); + OgaDestroyTokenizer(tokenizer); + OgaDestroyModel(model); +} + +int main() { + std::cout << "-------------" << std::endl; + std::cout << "Hello, Phi-2!" << std::endl; + std::cout << "-------------" << std::endl; + + std::cout << "C++ API" << std::endl; + CXX_API(); + + std::cout << "C API" << std::endl; + C_API(); return 0; } \ No newline at end of file diff --git a/src/ort_genai.h b/src/ort_genai.h new file mode 100644 index 000000000..82f8c722c --- /dev/null +++ b/src/ort_genai.h @@ -0,0 +1,189 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include "ort_genai_c.h" + +// GenAI C++ API +// +// This is a zero cost wrapper around the C API, and provides for a set of C++ classes with automatic resource management + +/* A simple end to end example of how to generate an answer from a prompt: + * + * auto model = OgaModel::Create("phi-2"); + * auto tokenizer = OgaTokenizer::Create(*model); + * + * auto sequences = OgaSequences::Create(); + * tokenizer->Encode("A great recipe for Kung Pao chicken is ", *sequences); + * + * auto params = OgaGeneratorParams::Create(*model); + * params->SetInputSequences(*sequences); + * params->SetSearchOption("max_length", 200); + * + * auto output_sequences = model->Generate(*params); + * auto out_string = tokenizer->Decode(output_sequences->Get(0)); + * + * std::cout << "Output: " << std::endl << out_string << std::endl; + */ + +// The types defined in this file are to give us zero overhead C++ style interfaces around an opaque C pointer. 
+// For example, there is no actual 'OgaModel' type defined anywhere, so we create a fake definition here +// that lets users have a C++ style OgaModel type that can be held in a std::unique_ptr. +// +// This OgaAbstract struct is to prevent accidentally trying to use them by value. +struct OgaAbstract { + OgaAbstract() = delete; + OgaAbstract(const OgaAbstract&) = delete; + void operator=(const OgaAbstract&) = delete; +}; + +struct OgaResult : OgaAbstract { + const char* GetError() const { return OgaResultGetError(this); } + static void operator delete(void* p) { OgaDestroyResult(reinterpret_cast(p)); } +}; + +// This is used to turn OgaResult return values from the C API into std::runtime_error exceptions +inline void OgaCheckResult(OgaResult* result) { + if (result) { + std::unique_ptr p_result{result}; // Take ownership so it's destroyed properly + throw std::runtime_error(p_result->GetError()); + } +} + +struct OgaModel : OgaAbstract { + static std::unique_ptr Create(const char* config_path) { + OgaModel* p; + OgaCheckResult(OgaCreateModel(config_path, &p)); + return std::unique_ptr(p); + } + + std::unique_ptr Generate(const OgaGeneratorParams& params) { + OgaSequences* p; + OgaCheckResult(OgaGenerate(this, ¶ms, &p)); + return std::unique_ptr(p); + } + + static void operator delete(void* p) { OgaDestroyModel(reinterpret_cast(p)); } +}; + +struct OgaString { + OgaString(const char* p) : p_{p} {} + ~OgaString() { OgaDestroyString(p_); } + + operator const char*() const { return p_; } + + const char* p_; +}; + +struct OgaSequences : OgaAbstract { + static std::unique_ptr Create() { + OgaSequences* p; + OgaCheckResult(OgaCreateSequences(&p)); + return std::unique_ptr(p); + } + + size_t Count() const { + return OgaSequencesCount(this); + } + + std::span Get(size_t index) const { + return {OgaSequencesGetSequenceData(this, index), OgaSequencesGetSequenceCount(this, index)}; + } + + static void operator delete(void* p) { OgaDestroySequences(reinterpret_cast(p)); } +}; + +struct OgaTokenizer : OgaAbstract { + static std::unique_ptr Create(const OgaModel& model) { + OgaTokenizer* p; + OgaCheckResult(OgaCreateTokenizer(&model, &p)); + return std::unique_ptr(p); + } + + void Encode(const char* str, OgaSequences& sequences) const { + OgaCheckResult(OgaTokenizerEncode(this, str, &sequences)); + } + + OgaString Decode(std::span tokens) const { + const char* p; + OgaCheckResult(OgaTokenizerDecode(this, tokens.data(), tokens.size(), &p)); + return p; + } + + static void operator delete(void* p) { OgaDestroyTokenizer(reinterpret_cast(p)); } +}; + +struct OgaTokenizerStream : OgaAbstract { + static std::unique_ptr Create(const OgaTokenizer& tokenizer) { + OgaTokenizerStream* p; + OgaCheckResult(OgaCreateTokenizerStream(&tokenizer, &p)); + return std::unique_ptr(p); + } + + /* + * Decode a single token in the stream. If this results in a word being generated, it will be returned in 'out'. + * The caller is responsible for concatenating each chunk together to generate the complete result. 
+ * 'out' is valid until the next call to OgaTokenizerStreamDecode or when the OgaTokenizerStream is destroyed + */ + const char* Decode(int32_t token) { + const char* out; + OgaCheckResult(OgaTokenizerStreamDecode(this, token, &out)); + return out; + } + + static void operator delete(void* p) { OgaDestroyTokenizerStream(reinterpret_cast(p)); } +}; + +struct OgaGeneratorParams : OgaAbstract { + static std::unique_ptr Create(const OgaModel& model) { + OgaGeneratorParams* p; + OgaCheckResult(OgaCreateGeneratorParams(&model, &p)); + return std::unique_ptr(p); + } + + void SetSearchOption(const char* name, int value) { + OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); + } + + void SetSearchOption(const char* name, double value) { + OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); + } + + void SetSearchOption(const char* name, bool value) { + OgaCheckResult(OgaGeneratorParamsSetSearchBool(this, name, value)); + } + + void SetInputIDs(const int32_t* input_ids, size_t input_ids_count, size_t sequence_length, size_t batch_size) { + OgaCheckResult(OgaGeneratorParamsSetInputIDs(this, input_ids, input_ids_count, sequence_length, batch_size)); + } + + void SetInputSequences(const OgaSequences& sequences) { + OgaCheckResult(OgaGeneratorParamsSetInputSequences(this, &sequences)); + } + + static void operator delete(void* p) { OgaDestroyGeneratorParams(reinterpret_cast(p)); } +}; + +struct OgaGenerator : OgaAbstract { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { + OgaGenerator* p; + OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); + return std::unique_ptr(p); + } + + bool IsDone() const { + return OgaGenerator_IsDone(this); + } + + void ComputeLogits() { + OgaCheckResult(OgaGenerator_ComputeLogits(this)); + } + + void GenerateNextToken() { + OgaCheckResult(OgaGenerator_GenerateNextToken(this)); + } + + std::span GetSequence(size_t index) const { + return {OgaGenerator_GetSequence(this, index), OgaGenerator_GetSequenceLength(this, index)}; + } + + static void operator delete(void* p) { OgaDestroyGenerator(reinterpret_cast(p)); } +}; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 1beb2a43b..e9548d509 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -1,11 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "ort_genai_c.h" #include #include -#include +#include #include #include +#include "span.h" +#include "ort_genai_c.h" #include "generators.h" #include "models/model.h" #include "search.h" @@ -22,24 +23,24 @@ OrtEnv& GetOrtEnv() { return *g_ort_env; } +struct Result { + explicit Result(const char* what) : what_{what} {} + std::string what_; +}; + } // namespace Generators extern "C" { #define OGA_TRY try { -#define OGA_CATCH \ - } \ - catch (const std::exception& e) { \ - return new OgaResult{e.what()}; \ +#define OGA_CATCH \ + } \ + catch (const std::exception& e) { \ + return reinterpret_cast(std::make_unique(e.what()).release()); \ } -struct OgaResult { - explicit OgaResult(const char* what) : what_{what} {} - std::string what_; -}; - -const char* OGA_API_CALL OgaResultGetError(OgaResult* result) { - return result->what_.c_str(); +const char* OGA_API_CALL OgaResultGetError(const OgaResult* result) { + return reinterpret_cast(result)->what_.c_str(); } OgaResult* OGA_API_CALL OgaCreateSequences(OgaSequences** out) { @@ -231,7 +232,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGetCurrentGpuDeviceId(int* device_id) { } void OGA_API_CALL OgaDestroyResult(OgaResult* p) { - delete p; + delete reinterpret_cast(p); } void OGA_API_CALL OgaDestroyString(const char* p) { diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index fbd394f10..41eb65909 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -42,7 +42,7 @@ typedef struct OgaTokenizerStream OgaTokenizerStream; * \return Error message contained in the OgaResult. The const char* is owned by the OgaResult * and can will be freed when the OgaResult is destroyed. */ -OGA_EXPORT const char* OGA_API_CALL OgaResultGetError(OgaResult* result); +OGA_EXPORT const char* OGA_API_CALL OgaResultGetError(const OgaResult* result); /* * \param[in] result OgaResult to be destroyed. 
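Editorial note, not part of the patch: the commit message above demonstrates the batch `Generate()` path; the same `ort_genai.h` header also supports an incremental loop, which pairs naturally with `OgaTokenizerStream` for printing text as it is produced. The sketch below is assembled only from the wrapper classes added in this patch, assuming the same `phi-2` model folder used in the examples and a C++20 compiler (for `std::span`); it is not code taken from the repository.

```cpp
#include <iostream>
#include "ort_genai.h"

int main() {
  auto model = OgaModel::Create("phi-2");
  auto tokenizer = OgaTokenizer::Create(*model);
  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);

  auto sequences = OgaSequences::Create();
  tokenizer->Encode("def is_prime(num):", *sequences);

  auto params = OgaGeneratorParams::Create(*model);
  params->SetSearchOption("max_length", 200);
  params->SetInputSequences(*sequences);

  // Step the generator manually instead of calling model->Generate(),
  // decoding the newest token of the sequence on every iteration.
  auto generator = OgaGenerator::Create(*model, *params);
  while (!generator->IsDone()) {
    generator->ComputeLogits();
    generator->GenerateNextToken();
    std::cout << tokenizer_stream->Decode(generator->GetSequence(0).back()) << std::flush;
  }
  std::cout << std::endl;
  return 0;
}
```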
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 3a04a8180..a1ec2b923 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -3,71 +3,22 @@ #include #include #include -#include +#include #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" #endif -struct Deleters { - void operator()(OgaResult* p) { - OgaDestroyResult(p); - } - void operator()(OgaSequences* p) { - OgaDestroySequences(p); - } - void operator()(OgaModel* p) { - OgaDestroyModel(p); - } - void operator()(OgaTokenizer* p) { - OgaDestroyTokenizer(p); - } - void operator()(OgaTokenizerStream* p) { - OgaDestroyTokenizerStream(p); - } - void operator()(OgaGeneratorParams* p) { - OgaDestroyGeneratorParams(p); - } - void operator()(OgaGenerator* p) { - OgaDestroyGenerator(p); - } -}; - -using OgaResultPtr = std::unique_ptr; -using OgaSequencesPtr = std::unique_ptr; -using OgaModelPtr = std::unique_ptr; -using OgaTokenizerPtr = std::unique_ptr; -using OgaTokenizerStreamPtr = std::unique_ptr; -using OgaGeneratorParamsPtr = std::unique_ptr; -using OgaGeneratorPtr = std::unique_ptr; - -void CheckResult(OgaResult* result) { - if (!result) - return; - - OgaResultPtr result_ptr{result}; - throw std::runtime_error(OgaResultGetError(result)); -} - TEST(CAPITests, TokenizerCAPI) { #if TEST_PHI2 - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; - - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; + auto model = OgaModel::Create(MODEL_PATH "phi-2"); + auto tokenizer = OgaTokenizer::Create(*model); // Encode single decode single { const char* input_string = "She sells sea shells by the sea shore."; - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - CheckResult(OgaTokenizerEncode(tokenizer, input_string, input_sequences)); - OgaSequencesPtr input_sequences_ptr{input_sequences}; - - std::span sequence{OgaSequencesGetSequenceData(input_sequences, 0), OgaSequencesGetSequenceCount(input_sequences, 0)}; - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + auto input_sequences = OgaSequences::Create(); + tokenizer->Encode(input_string, *input_sequences); + + auto out_string = tokenizer->Decode(input_sequences->Get(0)); ASSERT_STREQ(input_string, out_string); } @@ -77,39 +28,30 @@ TEST(CAPITests, TokenizerCAPI) { "The quick brown fox jumps over the lazy dog.", }; - OgaSequences* sequences; - CheckResult(OgaCreateSequences(&sequences)); - OgaSequencesPtr sequences_ptr{sequences}; + auto sequences = OgaSequences::Create(); // Encode all strings { - for (auto &string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, sequences)); + for (auto& string : input_strings) + tokenizer->Encode(string, *sequences); } // Decode one at a time - for (size_t i = 0; i < OgaSequencesCount(sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(sequences, i), OgaSequencesGetSequenceCount(sequences, i)}; - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + for (size_t i = 0; i < sequences->Count(); i++) { + auto out_string = tokenizer->Decode(sequences->Get(i)); std::cout << "Decoded string:" << out_string << std::endl; if (strcmp(input_strings[i], out_string) != 0) throw std::runtime_error("Token decoding mismatch"); - OgaDestroyString(out_string); } // Stream Decode one at a time - for (size_t i = 0; i < 
OgaSequencesCount(sequences); i++) { - OgaTokenizerStream* tokenizer_stream; - CheckResult(OgaCreateTokenizerStream(tokenizer, &tokenizer_stream)); - OgaTokenizerStreamPtr tokenizer_stream_ptr{tokenizer_stream}; + for (size_t i = 0; i < sequences->Count(); i++) { + auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); - std::span sequence{OgaSequencesGetSequenceData(sequences, i), OgaSequencesGetSequenceCount(sequences, i)}; + std::span sequence = sequences->Get(i); std::string stream_result; for (auto& token : sequence) { - const char* chunk; - CheckResult(OgaTokenizerStreamDecode(tokenizer_stream, token, &chunk)); - stream_result += std::string(chunk); + stream_result += tokenizer_stream->Decode(token); } std::cout << "Stream decoded string:" << stream_result << std::endl; if (strcmp(input_strings[i], stream_result.c_str()) != 0) @@ -120,17 +62,8 @@ TEST(CAPITests, TokenizerCAPI) { TEST(CAPITests, EndToEndPhiBatch) { #if TEST_PHI2 - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; - - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; - - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; + auto model = OgaModel::Create(MODEL_PATH "phi-2"); + auto tokenizer = OgaTokenizer::Create(*model); const char* input_strings[] = { "This is a test.", @@ -138,27 +71,20 @@ TEST(CAPITests, EndToEndPhiBatch) { "The quick brown fox jumps over the lazy dog.", }; + auto input_sequences = OgaSequences::Create(); for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + tokenizer->Encode(string, *input_sequences); - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 20)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 20); + params->SetInputSequences(*input_sequences); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; + auto output_sequences = model->Generate(*params); // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; - - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + for (size_t i = 0; i < output_sequences->Count(); i++) { + auto out_string = tokenizer->Decode(output_sequences->Get(i)); std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); } #endif } @@ -179,44 +105,33 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 // And copy the resulting gpt2_init_past_fp32.onnx file into these two files (as it's the same for gpt2) - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32", &model)); - OgaModelPtr model_ptr{model}; + auto model = OgaModel::Create(MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - 
OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", max_length)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", false)); - CheckResult(OgaGeneratorParamsSetInputIDs(params, input_ids.data(), input_ids.size(), sequence_length, batch_size)); + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", max_length); + params->SetInputIDs(input_ids.data(), input_ids.size(), sequence_length, batch_size); - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; + auto generator = OgaGenerator::Create(*model, *params); - while (!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken(generator)); + while (!generator->IsDone()) { + generator->ComputeLogits(); + generator->GenerateNextToken(); } // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - size_t token_count = OgaGenerator_GetSequenceLength(generator, i); - const int32_t* data = OgaGenerator_GetSequence(generator, i); - std::vector sequence(data, data + token_count); + auto sequence = generator->GetSequence(i); auto* expected_output_start = &expected_output[i * max_length]; EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); } // Test high level API - OgaSequences* sequences; - CheckResult(OgaGenerate(model, params, &sequences)); - OgaSequencesPtr sequences_ptr{sequences}; + auto sequences = model->Generate(*params); // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - std::span sequence{OgaSequencesGetSequenceData(sequences, i), OgaSequencesGetSequenceCount(sequences, i)}; + auto sequence = sequences->Get(i); auto* expected_output_start = &expected_output[i * max_length]; EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); @@ -224,151 +139,92 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { } #if TEST_PHI2 -TEST(CAPITests, TopKCAPI) { - float top_k = 50; - float temp = 0.6f; - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; +struct Phi2Test { + Phi2Test() { + model_ = OgaModel::Create(MODEL_PATH "phi-2"); + tokenizer_ = OgaTokenizer::Create(*model_); - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; + input_sequences_ = OgaSequences::Create(); - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; - const char* input_strings[] = { - "This is a test.", - "Rats are awesome pets!", - "The quick brown fox jumps over the lazy dog.", - }; + for (auto& string : input_strings) + tokenizer_->Encode(string, *input_sequences_); - for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + params_ = OgaGeneratorParams::Create(*model_); + params_->SetInputSequences(*input_sequences_); + params_->SetSearchOption("max_length", 40); + } - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - 
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + void Run() { + // Low level loop + { + auto generator = OgaGenerator::Create(*model_, *params_); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; + while (!generator->IsDone()) { + generator->ComputeLogits(); + generator->GenerateNextToken(); + } - // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + // Decode One at a time + for (size_t i = 0; i < 3; i++) { + auto out_string = tokenizer_->Decode(generator->GetSequence(i)); + std::cout << "Decoded string:" << out_string << std::endl; + } + } - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); - std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); - } -} + // High level + { + auto output_sequences = model_->Generate(*params_); -TEST(CAPITests, TopPCAPI) { - float top_p = 0.6f; - float temp = 0.6f; + // Decode The Batch + for (size_t i = 0; i < output_sequences->Count(); i++) { + auto out_string = tokenizer_->Decode(output_sequences->Get(i)); + std::cout << "Decoded string:" << out_string << std::endl; + } + } + } - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; + std::unique_ptr model_; + std::unique_ptr tokenizer_; + std::unique_ptr input_sequences_; + std::unique_ptr params_; +}; - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; +TEST(CAPITests, TopKCAPI) { + Phi2Test test; - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; + test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOption("top_k", 50); + test.params_->SetSearchOption("temperature", 0.6f); - const char* input_strings[] = { - "This is a test.", - "Rats are awesome pets!", - "The quick brown fox jumps over the lazy dog.", - }; + test.Run(); +} - for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); - - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; +TEST(CAPITests, TopPCAPI) { + Phi2Test test; - // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, 
i), OgaSequencesGetSequenceCount(output_sequences, i)}; + test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOption("top_p", 0.6f); + test.params_->SetSearchOption("temperature", 0.6f); - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); - std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); - } + test.Run(); } TEST(CAPITests, TopKTopPCAPI) { - float top_p = 0.6f; - int top_k = 50; - float temp = 0.6f; - - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; + Phi2Test test; - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; + test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOption("top_k", 50); + test.params_->SetSearchOption("top_p", 0.6f); + test.params_->SetSearchOption("temperature", 0.6f); - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; - - const char* input_strings[] = { - "This is a test.", - "Rats are awesome pets!", - "The quick brown fox jumps over the lazy dog.", - }; - - for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); - - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; - - // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; - - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); - std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); - } + test.Run(); } -#endif // TEST_PHI2 +#endif // TEST_PHI2 From 53df7dcb52092c2b73e20e1cc3a4089bc251139a Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:00:51 -0700 Subject: [PATCH 24/36] Support DML provider on Windows (#220) --- CMakeLists.txt | 8 ++++++++ build.py | 4 ++++ cmake/options.cmake | 1 + src/models/model.cpp | 14 +++++++++++++- 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 280a6148d..de12d6482 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,14 @@ else() list(REMOVE_ITEM generator_srcs ${generator_cuda_srcs}) endif() +if(USE_DML) + if(WIN32) + add_compile_definitions(USE_DML=1) + else() + message(FATAL_ERROR "USE_DML is ON but this isn't windows.") + endif() +endif() + if(ENABLE_TESTS AND TEST_PHI2) add_compile_definitions(TEST_PHI2=1) else() diff --git a/build.py b/build.py index 899415e3e..150ba7a54 100644 --- a/build.py +++ b/build.py @@ -96,6 
+96,7 @@ def validate_cuda_home(cuda_home: str | bytes | os.PathLike | None): def build( skip_wheel: bool = False, use_cuda: bool | None = None, + use_dml: bool | None = None, cuda_home: str | bytes | os.PathLike | None = None, cmake_generator: str | None = None, ort_home: str | bytes | os.PathLike | None = None, @@ -141,6 +142,7 @@ def build( "-DCMAKE_POSITION_INDEPENDENT_CODE=ON", "-DUSE_CXX17=ON", "-DUSE_CUDA=ON" if cuda_home else "-DUSE_CUDA=OFF", + "-DUSE_DML=ON" if use_dml else "-DUSE_DML=OFF", f"-DBUILD_WHEEL={build_wheel}", ] @@ -218,6 +220,7 @@ def build( parser.add_argument("--skip_csharp", action="store_true", help="Skip building the C# API.") parser.add_argument("--build_dir", default=None, help="Path to output directory.") parser.add_argument("--use_cuda", action="store_true", help="Whether to use CUDA. Default is to not use cuda.") + parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") parser.add_argument("--parallel", action="store_true", help="Enable parallel build.") parser.add_argument( "--config", @@ -231,6 +234,7 @@ def build( build( skip_wheel=args.skip_wheel, use_cuda=args.use_cuda, + use_dml=args.use_dml, cuda_home=args.cuda_home, cmake_generator=args.cmake_generator, ort_home=args.ort_home, diff --git a/cmake/options.cmake b/cmake/options.cmake index d42ea1ce1..80f004215 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -1,6 +1,7 @@ include(CMakeDependentOption) option(USE_CUDA "Build with CUDA support" ON) +option(USE_DML "Build with DML support" OFF) option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." ON) option(ENABLE_TESTS "Enable tests" ON) diff --git a/src/models/model.cpp b/src/models/model.cpp index a31b1ed84..55ae62728 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -9,6 +9,11 @@ #include "decoder_only.h" #include "whisper.h" #include "kernels.h" +#ifdef USE_DML +// Because dml_provider_factory includes windows headers that #define min and max, this next line will prevent this from happening +#define NOMINMAX +#include "dml_provider_factory.h" +#endif namespace Generators { @@ -291,7 +296,14 @@ void Model::CreateSessionOptions() { Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size())); ort_options.AppendExecutionProvider_ROCM(ort_provider_options); - device_type_ = DeviceType::CPU; // Scoring uses CPU, even though the model uses ROCM +#ifdef USE_DML + } else if (provider_options.name == "dml") { + const OrtDmlApi* p_dml_api{}; + Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&p_dml_api))); + if (!p_dml_api) + throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); + p_dml_api->SessionOptionsAppendExecutionProvider_DML(&ort_options, 0); +#endif } else throw std::runtime_error("Unknown provider type: " + provider_options.name); } From 1a13baefa53fc7899c0ca4ca26271610ee73d785 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 27 Mar 2024 12:52:18 -0400 Subject: [PATCH 25/36] Provide instruction on how to download and extract nightly ORT (#238) Provide instruction on how to download and extract nightly ORT --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index cc87d474c..b572a94fd 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,32 @@ Export int4 CPU version huggingface-cli login --token python -m onnxruntime_genai.models.builder -m 
microsoft/phi-2 -p int4 -e cpu -o ``` +## Getting the latest nightly Onnxruntime build +By default, onnxruntime-genai uses the latest stable release of onnxruntime. If you want to use the latest nightly build +of onnxruntime, you can download the nightly build of onnxruntime from our +[Azure DevOps Artifacts](https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/OnnxRuntime/). +nuget package can be uncompressed by renaming the extension to `.zip` and extracting the contents. +The onnxruntime dynamlic libraries and header files are available in the nightly build. You can extract the nuget package +and copy the dynamic libraries and header files to the `ort/` folder under onnxruntime-genai project root on the same level +as this `README.md` file. + +The library files are located in the `runtime/$OS-$Arch/native` folder and the header files are located in the +`build/native/include` folder in the nuget package. + +The final folder structure should look like this: +``` +onnxruntime-genai +│ README.md +│ ... +│ ort/ +│ │ include/ +│ │ │ coreml_provider_factory.h +│ │ │ ... +│ │ │ provider_options.h +│ │ lib/ +│ │ │ (lib)onnxruntime.(so|dylib|dll) +│ │ │ (lib)onnxruntime_providers_shared.(so|dylib|dll) +``` ## Contributing From f9e8e40e5d5453a7183db15e50231ba66e182f99 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 27 Mar 2024 13:15:17 -0400 Subject: [PATCH 26/36] Adding py-ado-release to pipeline (#233) --- .pipelines/nuget-publishing.yml | 6 - .pipelines/pypl-publishing.yml | 6 + .../stages/jobs/nuget-packaging-job.yml | 29 ++--- .../stages/jobs/py-linux-packaging-job.yml | 64 ----------- .pipelines/stages/jobs/py-packaging-job.yml | 105 ++++++++++++++++++ .../stages/jobs/py-win-packaging-job.yml | 71 ------------ .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../stages/jobs/steps/capi-win-step.yml | 2 +- ....yml => nuget-ado-feed-releasing-step.yml} | 20 ++-- .../jobs/steps/py-ado-feed-releasing-step.yml | 10 ++ .pipelines/stages/nuget-packaging-stage.yml | 8 +- .pipelines/stages/py-packaging-stage.yml | 22 +++- 12 files changed, 163 insertions(+), 182 deletions(-) delete mode 100644 .pipelines/stages/jobs/py-linux-packaging-job.yml create mode 100644 .pipelines/stages/jobs/py-packaging-job.yml delete mode 100644 .pipelines/stages/jobs/py-win-packaging-job.yml rename .pipelines/stages/jobs/steps/{nuget-releasing-step.yml => nuget-ado-feed-releasing-step.yml} (73%) create mode 100644 .pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index e91b57489..451411fd2 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -38,11 +38,6 @@ parameters: type: boolean default: false -- name: publish_to_nuget - displayName: 'Publish to NuGet.org' - type: boolean - default: false - resources: repositories: - repository: manylinux @@ -61,5 +56,4 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} ort_version: ${{ parameters.ort_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index d5cb45dca..edce0b37d 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -32,6 +32,11 @@ parameters: - '11.8' - '12.2' +- name: publish_to_ado_feed + displayName: 'Whether to publish the packages to ADO feed.' 
+ type: boolean + default: false + resources: repositories: - repository: manylinux @@ -50,3 +55,4 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} ort_version: ${{ parameters.ort_version }} cuda_version: ${{ parameters.cuda_version }} + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index af5250c3c..9bfd6454d 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -10,10 +10,12 @@ parameters: default: '' - name: os type: string + values: + - 'linux' + - 'win' - name: publish_to_ado_feed type: boolean -- name: publish_to_nuget - type: boolean + jobs: - job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging ${{ if eq(parameters.os, 'linux') }}: @@ -21,7 +23,7 @@ jobs: ${{ if eq(parameters.os, 'win') }}: pool: 'onnxruntime-Win-CPU-2022' timeoutInMinutes: 180 -# set variables here to be used in the template and steps + # set variables here to be used in the template and steps variables: - name: arch value: ${{ parameters.arch }} @@ -60,18 +62,19 @@ jobs: workspace: clean: all steps: - - template: steps/capi-${{ parameters.os }}-step.yml - parameters: - target: 'onnxruntime-genai' + - ${{ if eq(parameters.os, 'linux') }}: + - template: steps/capi-linux-step.yml + parameters: + target: 'onnxruntime-genai' + # TODO: Add a step to build the linux nuget package -# TODO: Add a step to build the linux nuget package - ${{ if eq(parameters.os, 'win') }}: - - template: steps/nuget-${{ parameters.os }}-step.yml - - ${{ if or(eq(parameters.publish_to_nuget, true), eq(parameters.publish_to_ado_feed, true))}}: - - template: steps/nuget-releasing-step.yml - parameters: - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} + - template: steps/capi-win-step.yml + parameters: + target: 'onnxruntime-genai' + - template: steps/nuget-win-step.yml + - ${{ if eq(parameters.publish_to_ado_feed, true)}}: + - template: steps/nuget-ado-feed-releasing-step.yml - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-linux-packaging-job.yml b/.pipelines/stages/jobs/py-linux-packaging-job.yml deleted file mode 100644 index b7c35d6a5..000000000 --- a/.pipelines/stages/jobs/py-linux-packaging-job.yml +++ /dev/null @@ -1,64 +0,0 @@ -parameters: -- name: arch - type: string -- name: ort_version - type: string -- name: ep - type: string -- name: cuda_version - type: string - default: '' -jobs: -- job: Linux_${{ parameters.ep }}_${{ parameters.arch }}_Wheels - strategy: - matrix: - Python38: - PyDotVer: '3.8' - PyNoDotVer: '38' - Python39: - PyDotVer: '3.9' - PyNoDotVer: '39' - Python310: - PyDotVer: '3.10' - PyNoDotVer: '310' - Python311: - PyDotVer: '3.11' - PyNoDotVer: '311' - Python312: - PyDotVer: '3.12' - PyNoDotVer: '312' - timeoutInMinutes: 240 - workspace: - clean: all - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' -# set variables here to be used in the template and steps - variables: - # The build machine pool doesn't have dotnet, so it can't run CG. 
- - name: skipComponentGovernanceDetection - value: true - - name: arch - value: ${{ parameters.arch }} - - name: ep - value: ${{ parameters.ep }} - - name: artifactName - value: 'onnxruntime-genai-capi-linux-${{ parameters.ep }}-${{ parameters.arch }}-python' - - name: cuda_version - value: ${{ parameters.cuda_version }} - - name: ort_version - value: ${{ parameters.ort_version }} - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - steps: - - - template: steps/capi-linux-step.yml - parameters: - target: 'python' - - - template: steps/compliant-and-cleanup-step.yml - diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml new file mode 100644 index 000000000..7066b070e --- /dev/null +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -0,0 +1,105 @@ +parameters: +- name: arch + type: string +- name: ep + type: string +- name: ort_version + type: string +- name: cuda_version + type: string + default: '' +- name: os + type: string + values: + - 'linux' + - 'win' +- name: publish_to_ado_feed + type: boolean + +jobs: +- job: python_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging + ${{ if eq(parameters.os, 'linux') }}: + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' + ${{ if eq(parameters.os, 'win') }}: + pool: 'onnxruntime-Win-CPU-2022' + strategy: + matrix: + Python38: + PyDotVer: '3.8' + PyNoDotVer: '38' + Python39: + PyDotVer: '3.9' + PyNoDotVer: '39' + Python310: + PyDotVer: '3.10' + PyNoDotVer: '310' + Python311: + PyDotVer: '3.11' + PyNoDotVer: '311' + Python312: + PyDotVer: '3.12' + PyNoDotVer: '312' + timeoutInMinutes: 240 + workspace: + clean: all + # set variables here to be used in the template and steps + variables: + - name: skipComponentGovernanceDetection + ${{ if eq(parameters.os, 'linux') }}: + value: true + ${{ if eq(parameters.os, 'win') }}: + value: false + - name: arch + value: ${{ parameters.arch }} + - name: artifactName + value: 'onnxruntime-genai-${{ parameters.os }}-${{ parameters.ep }}-${{ parameters.arch }}' + - name: buildConfig + value: 'Release' + - name: buildDir + value: 'build/${{ parameters.ep }}' + - name: cuda_version + value: ${{ parameters.cuda_version }} + - name: ep + value: ${{ parameters.ep }} + - name: ort_version + value: ${{ parameters.ort_version }} + - name: ort_filename + ${{ if eq(parameters.ep, 'cpu') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + ${{ else}}: + ${{if eq(parameters.cuda_version, '11.8') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: $(PyDotVer) + addToPath: true + architecture: $(arch) + - task: PythonScript@0 + inputs: + scriptSource: inline + script: | + import sys + import subprocess + subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', 'build', 'packaging', 'twine']) + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Install python modules' + + - ${{ if eq(parameters.os, 'linux') }}: + - template: 
steps/capi-linux-step.yml + parameters: + target: 'python' + + # Windows job needs to set the python version and install the required packages + - ${{ if eq(parameters.os, 'win') }}: + - template: steps/capi-win-step.yml + parameters: + target: 'python' + + - ${{ if eq(parameters.publish_to_ado_feed, true)}}: + - template: steps/py-ado-feed-releasing-step.yml + + - template: steps/compliant-and-cleanup-step.yml + diff --git a/.pipelines/stages/jobs/py-win-packaging-job.yml b/.pipelines/stages/jobs/py-win-packaging-job.yml deleted file mode 100644 index 0989398eb..000000000 --- a/.pipelines/stages/jobs/py-win-packaging-job.yml +++ /dev/null @@ -1,71 +0,0 @@ -parameters: -- name: arch - type: string -- name: ort_version - type: string -- name: cuda_version - type: string - default: '' -- name: ep - type: string -jobs: -- job: Windows_${{ parameters.ep }}_${{ parameters.arch }}_Wheels - pool: 'onnxruntime-Win-CPU-2022' - strategy: - matrix: - Python38_x64: - PythonVersion: '3.8' - Python39_x64: - PythonVersion: '3.9' - Python310_x64: - PythonVersion: '3.10' - Python311_x64: - PythonVersion: '3.11' - Python312_x64: - PythonVersion: '3.12' - timeoutInMinutes: 180 -# set variables here to be used in the template and steps - variables: - - name: ep - value: ${{ parameters.ep }} - - name: cuda_version - value: ${{ parameters.cuda_version }} - - name: artifactName - value: 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}-wheel' - - name: arch - value: ${{ parameters.arch }} - - name: ort_version - value: ${{ parameters.ort_version }} - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-win-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - workspace: - clean: all - steps: - - - task: UsePythonVersion@0 - inputs: - versionSpec: $(PythonVersion) - addToPath: true - architecture: $(arch) - - - task: PythonScript@0 - inputs: - scriptSource: inline - script: | - import sys - import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', 'build', 'packaging']) - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Install python modules' - - - template: steps/capi-win-step.yml - parameters: - target: 'python' - - - template: steps/compliant-and-cleanup-step.yml \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 6fa0f3c92..03f76feb1 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -93,7 +93,7 @@ steps: - task: BinSkim@4 displayName: 'Run BinSkim' inputs: - AnalyzeTargetGlob: '$(Build.Repository.LocalPath)/**/*.pyd' + AnalyzeTargetGlob: '$(Build.Repository.LocalPath)/build/**/*cpython*.so' continueOnError: true - bash: | diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index aebc4cd13..3681bffd4 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -59,7 +59,7 @@ steps: - ${{ if eq(parameters.target, 'onnxruntime-genai') }}: - template: compliant/win-esrp-dll-step.yml parameters: - FolderPath: '$(buildDir)' + FolderPath: '$(Build.Repository.LocalPath)\$(buildDir)' DisplayName: 'ESRP - Sign 
C++ dlls' Pattern: '*genai.dll' diff --git a/.pipelines/stages/jobs/steps/nuget-releasing-step.yml b/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml similarity index 73% rename from .pipelines/stages/jobs/steps/nuget-releasing-step.yml rename to .pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml index 8442fd069..331a9ea7c 100644 --- a/.pipelines/stages/jobs/steps/nuget-releasing-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml @@ -1,8 +1,3 @@ -parameters: -- name: publish_to_ado_feed - type: boolean -- name: publish_to_nuget - type: boolean steps: - task: NuGetToolInstaller@1 inputs: @@ -39,11 +34,10 @@ steps: parameters: packageFolder: '$(GDN_CODESIGN_TARGETDIRECTORY)' #This task must be run on a Windows machine -- ${{ if eq(parameters.publish_to_ado_feed, true) }}: - - task: NuGetCommand@2 - displayName: 'NuGet push to Azure DevOps Feed' - inputs: - command: push - packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' - publishVstsFeed: 'PublicPackages/onnxruntime-genai' - allowPackageConflicts: true \ No newline at end of file +- task: NuGetCommand@2 + displayName: 'NuGet push to Azure DevOps Feed' + inputs: + command: push + packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' + publishVstsFeed: 'PublicPackages/onnxruntime-genai' + allowPackageConflicts: true \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml b/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml new file mode 100644 index 000000000..85c0a7e3d --- /dev/null +++ b/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml @@ -0,0 +1,10 @@ +steps: +- task: TwineAuthenticate@1 + inputs: + artifactFeed: PublicPackages/onnxruntime-genai +- script: 'python -m twine upload -r onnxruntime-genai --config-file $(PYPIRC_PATH) --non-interactive *.whl' + workingDirectory: '$(Build.ArtifactStagingDirectory)/wheel' + displayName: 'Uploading wheels to PublicPackages/onnxruntime-genai' + retryCountOnTaskFailure: 3 + env: + SYSTEM_ACCESSTOKEN: $(System.AccessToken) \ No newline at end of file diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index db500916b..f962337ac 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -14,8 +14,6 @@ parameters: default: '' - name: publish_to_ado_feed type: boolean -- name: publish_to_nuget - type: boolean stages: - stage: nuget_packaging @@ -28,7 +26,6 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -38,7 +35,6 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -47,7 +43,6 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -56,5 +51,4 @@ stages: ep: 'cuda' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - 
publish_to_nuget: ${{ parameters.publish_to_nuget }} \ No newline at end of file + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index 62181490a..e23581f56 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -12,37 +12,47 @@ parameters: - name: cuda_version type: string default: '' +- name: publish_to_ado_feed + type: boolean + stages: -- stage: Python_Packaging_Stage +- stage: python_packaging jobs: - ${{ if eq(parameters.enable_win_cpu, true) }}: - - template: jobs/py-win-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - ${{ if eq(parameters.enable_win_cuda, true) }}: - - template: jobs/py-win-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' ort_version: ${{ parameters.ort_version }} - + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - ${{ if eq(parameters.enable_linux_cpu, true) }}: - - template: jobs/py-linux-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - ${{ if eq(parameters.enable_linux_cuda, true) }}: - - template: jobs/py-linux-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} From d656be953c66c3d1fba17a546b12a30348558323 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 27 Mar 2024 13:30:15 -0400 Subject: [PATCH 27/36] Mergin rel-0.1.0 back to main (#231) --- .pipelines/stages/jobs/steps/nuget-win-step.yml | 2 +- VERSION_INFO | 2 +- examples/csharp/README.md | 8 ++++++++ examples/python/README.md | 4 ++++ test/csharp/TestOnnxRuntimeGenAIAPI.cs | 6 +++--- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index 3191412a3..11b134caa 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -16,7 +16,7 @@ steps: DisplayName: 'ESRP - Sign C# dlls' Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | - $VERSION = '0.1.0-rc1' + $VERSION = '0.1.0-rc4' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` -Prop version=$VERSION ` -Prop genai_nuget_ext=$(genai_nuget_ext) ` diff --git a/VERSION_INFO b/VERSION_INFO index 49ffebcaa..3e2177af6 100644 --- a/VERSION_INFO +++ b/VERSION_INFO @@ -1 +1 @@ -0.1.0-dev \ No newline at end of file +0.1.0rc4 \ No newline at end of file diff --git a/examples/csharp/README.md b/examples/csharp/README.md index edb71a717..7052a02d4 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -1,5 +1,13 @@ # Gen-AI C# Phi-2 Example +## Install the onnxruntime-genai library + +* Install the python package + + ```bash + pip install onnxruntime-genai + ``` + ## Get the model You can generate the model using the model builder provided with this library, or bring your own model. 
diff --git a/examples/python/README.md b/examples/python/README.md index cf7fe3450..6d1490de2 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -4,6 +4,10 @@ Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). + ```bash + cd build/wheel + pip install onnxruntime_genai-*.whl + ``` ## Get the model diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 156f943b4..2113ffdca 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -93,7 +93,7 @@ public void TestTopKSearch() int topK = 100; float temp = 0.6f; ulong maxLength = 20; - + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) { @@ -135,7 +135,7 @@ public void TestTopPSearch() float topP = 0.6f; float temp = 0.6f; ulong maxLength = 20; - + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) { @@ -178,7 +178,7 @@ public void TestTopKTopPSearch() float topP = 0.6f; float temp = 0.6f; ulong maxLength = 20; - + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) { From 7aef327a36bc318f126c6d2703367b60595dd27b Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:58:26 -0700 Subject: [PATCH 28/36] DML - Preload DirectML.dll to not use OS version (#241) If we fail to preload, we abort using DirectML as using the OS copy will fail when we try to use it. Also Copy DirectML.dll to the install folder as part of the other onnxruntime files --- CMakeLists.txt | 3 +++ src/models/model.cpp | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index de12d6482..ff70bea11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,6 +183,9 @@ endif() # Copy the onnxruntime binaries into the build folder so it's found on launch file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") +if(USE_DML) + list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") +endif() foreach(DLL_FILE ${onnxruntime_libs}) add_custom_command( TARGET onnxruntime-genai POST_BUILD diff --git a/src/models/model.cpp b/src/models/model.cpp index 55ae62728..5e4c519ee 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -13,6 +13,23 @@ // Because dml_provider_factory includes windows headers that #define min and max, this next line will prevent this from happening #define NOMINMAX #include "dml_provider_factory.h" + +EXTERN_C IMAGE_DOS_HEADER __ImageBase; + +static std::wstring CurrentModulePath() { + wchar_t path[MAX_PATH]; + GetModuleFileNameW((HINSTANCE)&__ImageBase, path, _countof(path)); + + wchar_t absolute_path[MAX_PATH]; + wchar_t* name; + GetFullPathNameW(path, _countof(path), absolute_path, &name); + + auto idx = std::distance(absolute_path, name); + auto out_path = std::wstring(absolute_path); + out_path.resize(idx); + + return out_path; +} #endif namespace Generators { @@ -302,6 +319,9 @@ void Model::CreateSessionOptions() { Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&p_dml_api))); if (!p_dml_api) throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); + auto directml_dll = CurrentModulePath() + L"DirectML.dll"; + if 
(LoadLibraryExW(directml_dll.c_str(), nullptr, 0) == NULL) + throw std::runtime_error("DirectML.dll not found"); p_dml_api->SessionOptionsAppendExecutionProvider_DML(&ort_options, 0); #endif } else From a11c9a74109996a2bb58ef471dc14fdceddac61d Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 1 Apr 2024 13:11:25 -0700 Subject: [PATCH 29/36] Fix calculating rotary embedding dim (#244) ### Description This PR fixes how `rotary_embedding_dim` is calculated. ### Motivation and Context This PR fixes [this issue](https://github.com/microsoft/onnxruntime-genai/issues/237). --- src/python/py/models/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index fa022f7aa..740c7ca40 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1471,7 +1471,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor self.layernorm_attrs["simple"] = False self.rotemb_attrs["num_heads"] = self.num_attn_heads - self.rotemb_attrs["rotary_embedding_dim"] = self.num_attn_heads + self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) From 7869d91d5a2c0c5d5c04770fb11084247237b640 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Mon, 1 Apr 2024 14:19:39 -0700 Subject: [PATCH 30/36] Add fp32 test to nightly run (#242) --- test/python/test_onnxruntime_genai_e2e.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/python/test_onnxruntime_genai_e2e.py b/test/python/test_onnxruntime_genai_e2e.py index cc6f9dde2..f76354261 100644 --- a/test/python/test_onnxruntime_genai_e2e.py +++ b/test/python/test_onnxruntime_genai_e2e.py @@ -10,7 +10,7 @@ def download_model( - download_path: str | bytes | os.PathLike, device: str, model_identifier: str + download_path: str | bytes | os.PathLike, device: str, model_identifier: str, precision: str ): # python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o download_path command = [ @@ -20,7 +20,7 @@ def download_model( "-m", model_identifier, "-p", - "int4", + precision, "-e", device, "-o", @@ -51,7 +51,9 @@ def run_model(model_path: str | bytes | os.PathLike): if __name__ == "__main__": for model_name in ["microsoft/phi-2"]: - with tempfile.TemporaryDirectory() as temp_dir: - device = "cuda" if og.is_cuda_available() else "cpu" - download_model(temp_dir, device, model_name) - run_model(temp_dir) + for precision in ["int4", "fp32"]: + with tempfile.TemporaryDirectory() as temp_dir: + device = "cuda" if og.is_cuda_available() else "cpu" + download_model(temp_dir, device, model_name, precision) + run_model(temp_dir) + From 6ad63e199b528a3d075d3a0b2e1dc91f8ff8a6e1 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 2 Apr 2024 00:36:52 -0700 Subject: [PATCH 31/36] Make position_ids be an optional input (#246) --- src/models/model.cpp | 8 +++++ src/models/model.h | 3 ++ src/models/position_ids.cpp | 58 ++++++++++++++++++++----------------- 
src/models/position_ids.h | 1 + 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/src/models/model.cpp b/src/models/model.cpp index 5e4c519ee..6ab493f19 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -210,6 +210,14 @@ SessionInfo::SessionInfo(OrtSession& session) { } } +bool SessionInfo::HasInput(const std::string& name) const { + return inputs_.find(name) != inputs_.end(); +} + +bool SessionInfo::HasOutput(const std::string& name) const { + return outputs_.find(name) != outputs_.end(); +} + ONNXTensorElementDataType SessionInfo::GetInputDataType(const std::string& name) const { auto result = inputs_.find(name); if (result == inputs_.end()) diff --git a/src/models/model.h b/src/models/model.h index 9af784362..a4b70ae46 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -88,6 +88,9 @@ struct Tokenizer : std::enable_shared_from_this { struct SessionInfo { SessionInfo(OrtSession& session); + bool HasInput(const std::string& name) const; + bool HasOutput(const std::string& name) const; + ONNXTensorElementDataType GetInputDataType(const std::string& name) const; ONNXTensorElementDataType GetOutputDataType(const std::string& name) const; diff --git a/src/models/position_ids.cpp b/src/models/position_ids.cpp index a0e8d6b56..ec6ebd579 100644 --- a/src/models/position_ids.cpp +++ b/src/models/position_ids.cpp @@ -8,7 +8,9 @@ namespace Generators { PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray& sequence_lengths_unk) : model_{model}, state_{state} { - type_ = model_.session_info_->GetInputDataType(model_.config_->model.decoder.inputs.position_ids); + has_position_ids_ = model_.session_info_->HasInput(model_.config_->model.decoder.inputs.position_ids); + type_ = model_.session_info_->GetInputDataType(model_.config_->model.decoder.inputs.attention_mask); + if (type_ != Ort::TypeToTensorType::type && type_ != Ort::TypeToTensorType::type) throw std::runtime_error("position_ids & attention_mask only support int32 or int64 types"); @@ -33,38 +35,42 @@ PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray void PositionIDs::Add() { input_index_ = state_.inputs_.size(); - state_.inputs_.push_back(position_ids_.get()); - state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); + if (has_position_ids_) { + state_.inputs_.push_back(position_ids_.get()); + state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); + } state_.inputs_.push_back(attention_mask_.get()); state_.input_names_.push_back(model_.config_->model.decoder.inputs.attention_mask.c_str()); } void PositionIDs::Update(int current_length) { - // Reallocate position_ids for the 2nd and onward shape - if (position_ids_next_) { - position_ids_ = std::move(position_ids_next_); - position_ids_shape_[1] = 1; - state_.inputs_[input_index_] = position_ids_.get(); - } else { // Just incrementing existing position IDs - switch (model_.device_type_) { - case DeviceType::CPU: { - if (type_ == Ort::TypeToTensorType::type) - UpdatePositionIDs(); - else - UpdatePositionIDs(); - break; - } + if (has_position_ids_) { + // Reallocate position_ids for the 2nd and onward shape + if (position_ids_next_) { + position_ids_ = std::move(position_ids_next_); + position_ids_shape_[1] = 1; + state_.inputs_[input_index_] = position_ids_.get(); + } else { // Just incrementing existing position IDs + switch (model_.device_type_) { + case DeviceType::CPU: { + if (type_ == Ort::TypeToTensorType::type) + UpdatePositionIDs(); + else + 
UpdatePositionIDs(); + break; + } #if USE_CUDA - case DeviceType::CUDA: - if (type_ == Ort::TypeToTensorType::type) - cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); - else - cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); - break; + case DeviceType::CUDA: + if (type_ == Ort::TypeToTensorType::type) + cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); + else + cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); + break; #endif - default: - throw std::runtime_error("PositionIDs::Update - Unsupported device type"); + default: + throw std::runtime_error("PositionIDs::Update - Unsupported device type"); + } } } @@ -95,7 +101,7 @@ void PositionIDs::Update(int current_length) { throw std::runtime_error("PositionIDs::Update - Unsupported device type"); } attention_mask_ = std::move(next_attention_mask); - state_.inputs_[input_index_ + 1] = attention_mask_.get(); + state_.inputs_[input_index_ + has_position_ids_] = attention_mask_.get(); } } diff --git a/src/models/position_ids.h b/src/models/position_ids.h index 411b55c1c..22601f359 100644 --- a/src/models/position_ids.h +++ b/src/models/position_ids.h @@ -21,6 +21,7 @@ struct PositionIDs { State& state_; size_t input_index_{~0U}; ONNXTensorElementDataType type_; // Common type for position_ids and attention_mask + bool has_position_ids_; std::array position_ids_shape_{}; // {params.batch_size*params.beam_size, params.sequence_length} std::unique_ptr position_ids_; From 935ed35651aca79ff628da4a53b12d77be2af269 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Wed, 3 Apr 2024 12:15:25 -0700 Subject: [PATCH 32/36] refine the c example (#248) --- examples/c/CMakeLists.txt | 31 ++++++++++++++++++++++++------- examples/c/README.md | 2 +- examples/c/src/main.cpp | 24 +++++++++++++++++------- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index d44909286..9b33a3ed3 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -4,13 +4,24 @@ project(phi2) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++2a") +set(ORT_GENAI_LIB_DIR ${CMAKE_SOURCE_DIR}/lib) + +if(WIN32) + set(ONNXRUNTIME_GENAI_LIB "onnxruntime-genai.dll") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.dll") +elseif(APPLE) + set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.dylib") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.dylib") +else() + set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.so") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.so") +endif() + add_executable(phi2 ${CMAKE_SOURCE_DIR}/src/main.cpp) -add_library(onnxruntime-genai SHARED IMPORTED) -set_target_properties(onnxruntime-genai PROPERTIES - IMPORTED_LOCATION_RELEASE ${CMAKE_SOURCE_DIR}/lib/onnxruntime-genai.dll - IMPORTED_IMPLIB_RELEASE ${CMAKE_SOURCE_DIR}/lib/onnxruntime-genai.lib -) + +target_link_directories(phi2 PRIVATE ${ORT_GENAI_LIB_DIR}) +target_link_libraries(phi2 PRIVATE ${ONNXRUNTIME_GENAI_LIB}) target_include_directories(phi2 PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries( @@ -18,5 +29,11 @@ target_link_libraries( PUBLIC onnxruntime-genai) -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/phi-2" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/Release") -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/lib/" DESTINATION 
"${CMAKE_CURRENT_BINARY_DIR}/Release") +file(GLOB ort_genai_libs "${CMAKE_SOURCE_DIR}/lib/${ONNXRUNTIME_GENAI_DEPENDENCY}") + +foreach(DLL_FILE ${ort_genai_libs}) + add_custom_command( + TARGET phi2 POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_FILE} $ + ) +endforeach() \ No newline at end of file diff --git a/examples/c/README.md b/examples/c/README.md index 8cd2168fd..0a45578cd 100644 --- a/examples/c/README.md +++ b/examples/c/README.md @@ -48,5 +48,5 @@ cmake --build . --config Release ```bash cd build\\Release -.\phi2.exe +.\phi2.exe path_to_model ``` diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index d9aeb68a8..e4be639f2 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -4,8 +4,8 @@ // C++ API Example -void CXX_API() { - auto model = OgaModel::Create("phi-2"); +void CXX_API(const char* model_path) { + auto model = OgaModel::Create(model_path); auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; @@ -34,9 +34,9 @@ void CheckResult(OgaResult* result) { } } -void C_API() { +void C_API(const char* model_path) { OgaModel* model; - OgaCreateModel("phi-2", &model); + OgaCreateModel(model_path, &model); OgaTokenizer* tokenizer; CheckResult(OgaCreateTokenizer(model, &tokenizer)); @@ -74,16 +74,26 @@ void C_API() { OgaDestroyModel(model); } -int main() { +static void print_usage(int /*argc*/, char** argv) { + std::cerr << "usage: " << argv[0] << " model_path" << std::endl; +} + +int main(int argc, char** argv) { + if (argc != 2) { + print_usage(argc, argv); + return -1; + } + + std::cout << "-------------" << std::endl; std::cout << "Hello, Phi-2!" << std::endl; std::cout << "-------------" << std::endl; std::cout << "C++ API" << std::endl; - CXX_API(); + CXX_API(argv[1]); std::cout << "C API" << std::endl; - C_API(); + C_API(argv[1]); return 0; } \ No newline at end of file From 75f87021b445247b650ef616e8f3d8211f26b64c Mon Sep 17 00:00:00 2001 From: rui-ren Date: Wed, 3 Apr 2024 13:14:21 -0700 Subject: [PATCH 33/36] update README doc (#247) 1. Update README.md - Add the `model builder` step to `README.md` - Use `abspath` for `og.Model`. - Update to `GeneratorParams` 2. Add `pandas` as dependent packages for benchmark. Co-authored-by: Ubuntu --- README.md | 13 +++++++++++-- benchmark/python/README | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b572a94fd..0e8b42566 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,19 @@ See full documentation at [https://onnxruntime.ai/docs/genai]. [Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package. +1. Build the model +```shell +python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./models/phi2 +``` + +2. Run inference ```python +import os import onnxruntime_genai as og -model = og.Model(f'models/microsoft/phi-2') +model_path = os.path.abspath("./models/phi2") + +model = og.Model(model_path) tokenizer = og.Tokenizer(model) @@ -64,7 +73,7 @@ prompt = '''def print_prime(n): tokens = tokenizer.encode(prompt) -params = og.SearchParams(model) +params = og.GeneratorParams(model) params.set_search_options({"max_length":200}) params.input_ids = tokens diff --git a/benchmark/python/README b/benchmark/python/README index da1174309..67cac3ccb 100644 --- a/benchmark/python/README +++ b/benchmark/python/README @@ -2,7 +2,7 @@ This is an end-to-end benchmarking script for any GenAI-supported ONNX model. 
Prerequisites: -0) Install onnxruntime-genai and onnxruntime +0) Install pandas, onnxruntime-genai and onnxruntime 1) Use builder.py to build the desired ONNX model @@ -10,4 +10,4 @@ Prerequisites: Example call to benchmarking script -python benchmark_e2e.py -i {model folder} -b 1 -l 128 -g 256 -r 100 -w 10 -k 5 -o {output csv file name} \ No newline at end of file +python benchmark_e2e.py -i {model folder} -b 1 -l 128 -g 256 -r 100 -w 10 -k 5 -o {output csv file name} From a2789fb84a381b202790a905527e1aaae716524a Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:25:51 -0700 Subject: [PATCH 34/36] Add a random_seed search option (#250) It goes into the search options as `"random_seed":1234` (for example). The default value is -1, which means to use a random seed. There is an issue with CUDA where the output can eventually differ even if the same seed is used. I did some investigation and the random numbers match, but the tokens chosen will differ eventually in longer outputs. With CPU the output always matches. --- src/config.cpp | 2 ++ src/config.h | 1 + src/cuda_sampling.cu | 42 +++++++++++++++++++++--------------------- src/cuda_sampling.cuh | 20 +++++++++++--------- src/search.cpp | 12 +++++++++++- src/search.h | 1 - src/search_cuda.cpp | 8 +++++++- 7 files changed, 53 insertions(+), 33 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 74045a524..24d789bdd 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -285,6 +285,8 @@ struct Search_Element : JSON::Element { v_.diversity_penalty = static_cast(value); } else if (name == "length_penalty") { v_.length_penalty = static_cast(value); + } else if (name == "random_seed") { + v_.random_seed = static_cast(value); } else throw JSON::unknown_value_error{}; } diff --git a/src/config.h b/src/config.h index 2621edc21..b5eb67dcc 100644 --- a/src/config.h +++ b/src/config.h @@ -86,6 +86,7 @@ struct Config { float diversity_penalty{}; float length_penalty{1.0f}; // Exponential penalty to the length that is used with beam-based generation. length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences. 
bool past_present_share_buffer{}; // The past/present kv tensors are shared and allocated once to max_length (cuda only) + int random_seed{-1}; // -1 = Seed with random device, otherwise use value to seed RNG } search; }; diff --git a/src/cuda_sampling.cu b/src/cuda_sampling.cu index 471d593ae..bef166d9f 100644 --- a/src/cuda_sampling.cu +++ b/src/cuda_sampling.cu @@ -11,7 +11,6 @@ #include "smartptrs.h" #include #include -#include #include namespace Generators { @@ -20,7 +19,15 @@ namespace cuda { constexpr int kMaxThreads = 1024; constexpr int kGPUWarpSize = 32; -SamplingData::SamplingData(int batch_size, int vocab_size, cudaStream_t stream) { +__global__ void InitCurandStates(unsigned long long seed, curandState* states, int batch_size) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= batch_size) + return; + + curand_init(seed, index, 0, &states[index]); +} + +SamplingData::SamplingData(unsigned long long random_seed, int batch_size, int vocab_size, cudaStream_t stream) { indices_sorted = CudaMallocArray(vocab_size * batch_size); scores_sorted = CudaMallocArray(vocab_size * batch_size); scores_softmaxed = CudaMallocArray(vocab_size * batch_size); @@ -28,10 +35,13 @@ SamplingData::SamplingData(int batch_size, int vocab_size, cudaStream_t stream) thresholds = CudaMallocArray(batch_size); indices_in = CudaMallocArray(vocab_size * batch_size); offsets = CudaMallocArray(batch_size + 1); + curand_states = CudaMallocArray(batch_size); temp_storage_bytes = 0; cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, (float*)nullptr, (float*)nullptr, (int*)nullptr, (int*)nullptr, vocab_size*batch_size, batch_size, (int*)nullptr, (int*)nullptr, 0, sizeof(float) * 8, stream); temp_buffer = CudaMallocArray(temp_storage_bytes / sizeof(float)); + + InitCurandStates<<>>(random_seed, curand_states.get(), batch_size); } // Softmax Kernels and Launchers @@ -431,37 +441,31 @@ void LaunchGetTopKSubset(cudaStream_t stream, float* scores_in, float* scores_ou } // Sets up random thresholds for top p or top k sampling -__global__ void RandomThresholdKernelTopPAndK(int seed, float* thresholds, float* prefix_sums, int batch_size, float p, int k) { +__global__ void RandomThresholdKernelTopPAndK(curandState* curand_states, float* thresholds, float* prefix_sums, int batch_size, float p, int k) { int index = threadIdx.x + blockIdx.x * blockDim.x; - curandState state; - curand_init(seed, index, 0, &state); float k_prob = prefix_sums[k-1]; if (index < batch_size) { float min_p = fminf(p, k_prob); - thresholds[index] = min_p * curand_uniform(&state); + thresholds[index] = min_p * curand_uniform(&curand_states[index]); } } // Sets up random thresholds for top p or top k sampling -__global__ void RandomThresholdKernelTopP(int seed, float* thresholds, float* prefix_sums, int batch_size, float p) { +__global__ void RandomThresholdKernelTopP(curandState* curand_states, float* thresholds, float* prefix_sums, int batch_size, float p) { int index = threadIdx.x + blockIdx.x * blockDim.x; - curandState state; - curand_init(seed, index, 0, &state); if (index < batch_size) { - thresholds[index] = p * curand_uniform(&state); + thresholds[index] = p * curand_uniform(&curand_states[index]); } } // Sets up random thresholds for top p or top k sampling -__global__ void RandomThresholdKernelTopK(int seed, float* thresholds, float* prefix_sums, int batch_size, int k) { +__global__ void RandomThresholdKernelTopK(curandState* curand_states, float* thresholds, float* prefix_sums, int batch_size, 
int k) { int index = threadIdx.x + blockIdx.x * blockDim.x; - curandState state; - curand_init(seed, index, 0, &state); if (index < batch_size) { - thresholds[index] = prefix_sums[k-1] * curand_uniform(&state); + thresholds[index] = prefix_sums[k - 1] * curand_uniform(&curand_states[index]); } } @@ -502,16 +506,12 @@ void LaunchSampleKernel(SamplingData* data, cudaStream_t stream, float* scores, PrefixSumKernel<256><<>>(scores, prefix_sums.data(), sample_range, batch_size); // Random Thresholds for Top P or Top K Sampling std::span thresholds{data->thresholds.get(), static_cast(batch_size)}; - std::random_device rd; - std::mt19937 eee(rd()); - std::uniform_int_distribution dist(0, std::numeric_limits::max()); - int seed = dist(eee); if (p > 0.0 && k > 1) { - RandomThresholdKernelTopPAndK<<>>(seed, thresholds.data(), prefix_sums.data(), batch_size, p, k); + RandomThresholdKernelTopPAndK<<>>(data->curand_states.get(), thresholds.data(), prefix_sums.data(), batch_size, p, k); } else if (p > 0.0) { - RandomThresholdKernelTopP<<>>(seed, thresholds.data(), prefix_sums.data(), batch_size, p); + RandomThresholdKernelTopP<<>>(data->curand_states.get(), thresholds.data(), prefix_sums.data(), batch_size, p); } else if (k > 1) { - RandomThresholdKernelTopK<<>>(seed, thresholds.data(), prefix_sums.data(), batch_size, k); + RandomThresholdKernelTopK<<>>(data->curand_states.get(), thresholds.data(), prefix_sums.data(), batch_size, k); } SampleKernel<256><<>>(prefix_sums.data(), indices, index_out, sample_range, thresholds.data()); } diff --git a/src/cuda_sampling.cuh b/src/cuda_sampling.cuh index f7f74b827..cc8ab9867 100644 --- a/src/cuda_sampling.cuh +++ b/src/cuda_sampling.cuh @@ -1,20 +1,22 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#include "smartptrs.h" +#include namespace Generators { namespace cuda { struct SamplingData { - SamplingData(int batch_size, int vocab_size, cudaStream_t stream); - std::unique_ptr indices_sorted = nullptr; - std::unique_ptr scores_sorted = nullptr; - std::unique_ptr scores_softmaxed = nullptr; - std::unique_ptr prefix_sums = nullptr; - std::unique_ptr thresholds = nullptr; - std::unique_ptr indices_in = nullptr; - std::unique_ptr offsets = nullptr; - std::unique_ptr temp_buffer = nullptr; + SamplingData(unsigned long long random_seed, int batch_size, int vocab_size, cudaStream_t stream); + cuda_unique_ptr indices_sorted; + cuda_unique_ptr scores_sorted; + cuda_unique_ptr scores_softmaxed; + cuda_unique_ptr prefix_sums; + cuda_unique_ptr thresholds; + cuda_unique_ptr indices_in; + cuda_unique_ptr offsets; + cuda_unique_ptr temp_buffer; + cuda_unique_ptr curand_states; size_t temp_storage_bytes = 0; }; diff --git a/src/search.cpp b/src/search.cpp index dd3389270..aeff79c1d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -15,7 +15,17 @@ Search_Cpu::Search_Cpu(const GeneratorParams& params) } GreedySearch_Cpu::GreedySearch_Cpu(const GeneratorParams& params) - : Search_Cpu(params), gen_(rd_()) { + : Search_Cpu(params) { + if (params_->search.random_seed != -1) + gen_.seed(params_->search.random_seed); + else { + std::random_device rd; + std::array data; + std::generate(std::begin(data), std::end(data), std::ref(rd)); + std::seed_seq seq(data.begin(), data.end()); + gen_.seed(seq); + } + next_tokens_buffer_ = AllocateArray(params.batch_size, &next_tokens_); memset(next_tokens_.data(), 0, next_tokens_.size_bytes()); diff --git a/src/search.h b/src/search.h index 5a52c11e2..70dab187b 100644 --- a/src/search.h +++ b/src/search.h @@ -83,7 +83,6 @@ struct GreedySearch_Cpu : Search_Cpu { std::unique_ptr eos_seen_buffer_; int not_done_count_{params_->batch_size}; // When zero, every batch entry is done (starts at batch_size_) - std::random_device rd_; std::mt19937 gen_; }; diff --git a/src/search_cuda.cpp b/src/search_cuda.cpp index aa6d85431..9a5c6e9de 100644 --- a/src/search_cuda.cpp +++ b/src/search_cuda.cpp @@ -33,7 +33,13 @@ GreedySearch_Cuda::GreedySearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { next_tokens_buffer_ = CudaMallocArray(params.batch_size, &next_tokens_); cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_->cuda_stream); - samplingdata_ = std::make_unique(params_->batch_size, params_->vocab_size, params_->cuda_stream); + + unsigned long long random_seed; + if (params_->search.random_seed != -1) + random_seed = params_->search.random_seed; + else + random_seed = std::random_device{}(); + samplingdata_ = std::make_unique(random_seed, params_->batch_size, params_->vocab_size, params_->cuda_stream); } BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params) From 18adb67dce145579779dd0b6a3e2b25826a7b459 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 8 Apr 2024 16:47:10 -0700 Subject: [PATCH 35/36] Add packed QKV and rotary embedding within GroupQueryAttention to model builder (#245) --- .gitignore | 2 +- src/python/py/models/README.md | 4 +- src/python/py/models/builder.py | 265 ++++++++++++++++++++------------ 3 files changed, 168 insertions(+), 103 deletions(-) diff --git a/.gitignore b/.gitignore index 60b60827f..d42e707d4 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,7 @@ /test/test_models/* /cache_models /onnxruntime-linux-x64-* -/*.csv +*.csv .idea cache_dir 
example-models diff --git a/src/python/py/models/README.md b/src/python/py/models/README.md index 0fdd2c818..34f24083e 100644 --- a/src/python/py/models/README.md +++ b/src/python/py/models/README.md @@ -62,10 +62,10 @@ python3 builder.py -m model_name -o path_to_output_folder -p precision -e execut This scenario is where your PyTorch model has been customized or finetuned for one of the currently supported model architectures and your model can be loaded in Hugging Face. ``` # From wheel: -python3 -m onnxruntime_genai.models.builder -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider +python3 -m onnxruntime_genai.models.builder -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider -c cache_dir_to_store_temp_files # From source: -python3 builder.py -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider +python3 builder.py -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider -c cache_dir_to_store_temp_files ``` ### GGUF Model diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 740c7ca40..16f864c38 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -19,6 +19,7 @@ import os import textwrap + class Model: def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.context_length = config.max_position_embeddings @@ -48,7 +49,13 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.value_infos = [] self.nodes = [] - # Map input names to input shapes + # Map input names to their types and shapes + self.input_names = ["input_ids", "attention_mask", "position_ids"] + self.input_types = { + "input_ids": TensorProto.INT64, + "attention_mask": TensorProto.INT64, + "position_ids": TensorProto.INT64, + } self.input_shapes = { "input_ids": ["batch_size", "sequence_length"], "attention_mask": ["batch_size", "total_sequence_length"], @@ -105,19 +112,35 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 self.rotemb_attrs = { "create_rotary_embedding_caches": True, # Create cos/sin caches for rotary embeddings + "theta": rope_theta, # Base value if calculating cos/sin caches from scratch "partial_rotary_factor": partial_rotary_factor, # Factor for partial rotary embeddings + "interleaved": 0, # Interleave the rotary embeddings (e.g. [0, 0, 0, 1, 1, 1] to [0, 1, 0, 1, 0, 1], RotaryEmbedding kernel expects a default value of 0) "num_heads": 0, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0) "rotary_embedding_dim": 0, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0) - "theta": rope_theta, # Base value if calculating cos/sin caches from scratch } # Attention-specific variables (MHA, GQA, GQA + Rot.Emb., etc.) 
self.attention_attrs = { - "op_type": "MultiHeadAttention", # Attention op to use - "use_gqa": ep == "cuda" and io_dtype == TensorProto.FLOAT16 # Check if GroupQueryAttention can be used + "op_type": "MultiHeadAttention", # Attention op to use + "use_rotemb_in_gqa": False, # Use rotary embeddings within GroupQueryAttention (instead of a separate RotaryEmbedding op) + "use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V) } - if self.attention_attrs["use_gqa"]: + if ep == "cuda" and io_dtype == TensorProto.FLOAT16: self.attention_attrs["op_type"] = "GroupQueryAttention" + print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 CUDA and FP16 CUDA.") + + self.attention_attrs["use_packed_matmul"] = self.num_attn_heads == self.num_kv_heads + + # GQA + Rot.Emb. does not require `position ids` as input + self.attention_attrs["use_rotemb_in_gqa"] = True + self.input_names.remove("position_ids") + + # MLP-specific variables + self.mlp_attrs = { + "use_proj": True, # Use projection style for MLP (GateProj/UpProj/DownProj) + "use_fc": False, # Use fully-connected style for MLP (FC1/FC2) + "output_0": "", # Output 0 for MLP layer + } # Quantization-specific variables (INT4, INT8, etc.) self.quant_attrs = { @@ -129,7 +152,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): config = GenerationConfig.from_pretrained(model_name_or_path, **extra_kwargs) - inputs = dict(zip(self.input_shapes.keys(), self.input_shapes.keys())) + inputs = dict(zip(self.input_names, self.input_names)) inputs.update({ "past_key_names": "past_key_values.%d.key", "past_value_names": "past_key_values.%d.value", @@ -238,6 +261,7 @@ def save_model(self, out_dir): if os.path.exists(data_path): print(f"Overwriting {data_path}") os.remove(data_path) + save_model( model, out_path, @@ -305,9 +329,10 @@ def make_graph(self, *args, doc_string=None, **kwargs): def make_inputs_and_outputs(self): # Add model-specific inputs to list of model inputs inputs = [] - for name in self.model_inputs: + for name in self.input_names: + dtype = self.input_types[name] shape = self.input_shapes[name] - inputs.append(helper.make_tensor_value_info(name, TensorProto.INT64, shape=shape)) + inputs.append(helper.make_tensor_value_info(name, dtype, shape=shape)) # Add model-specific outputs to list of model outputs outputs = [ @@ -474,9 +499,13 @@ def make_matmul_fp16_or_fp32(self, matmul, name, root_input, **kwargs): # self.make_node("MatMulNBits", inputs=[root_input, weight, scales], outputs=[output], name=name) # self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) - # TODO: make packed QKV MatMul - # def make_packed_matmul(self, q_matmul, k_matmul, v_matmul, name, root_input, **kwargs): - # pass + def make_packed_matmul(self, q_matmul, k_matmul, v_matmul, name, root_input, **kwargs): + # N = num_heads * head_size, H = hidden_size + # Combine 3 Matmuls of shape NxH into 1 packed MatMul of shape 3NxH + # Note: Packed MatMul is of shape 3NxH instead of Hx3N because `make_matmul` will apply a transpose before saving + N, H = q_matmul.shape + matmul = np.stack((q_matmul.transpose(), k_matmul.transpose(), v_matmul.transpose()), axis=1).reshape(H, 3*N).transpose() + self.make_matmul(matmul, name, root_input, **kwargs) def make_add_bias(self, add, name, root_input, **kwargs): bias = name[1:].replace("/", ".") + ".bias" @@ -492,6 +521,11 @@ 
def make_add_bias(self, add, name, root_input, **kwargs): else: self.make_add(name, add_bias_inputs, dtype=self.io_dtype, shape=shape) + def make_packed_add(self, q_add, k_add, v_add, name, root_input, **kwargs): + # Combine 3 Adds of shape H into 1 packed Add of shape 3H + add = np.stack((q_add, k_add, v_add), axis=0).flatten() + self.make_add_bias(add, name, root_input, **kwargs) + def make_embedding(self, embedding): weight = "model.embed_tokens.weight" self.make_external_tensor(embedding.astype(self.to_numpy_dtype[self.io_dtype]), weight) @@ -587,7 +621,7 @@ def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): inputs = [root_input, kwargs.pop("position_ids"), cos_cache_name, sin_cache_name] output = f"{name}/output_0" - self.make_node("RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", interleaved=0, **kwargs) + self.make_node("RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", interleaved=self.rotemb_attrs["interleaved"], **kwargs) self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * (self.num_kv_heads if "k_rotary" in name else self.num_attn_heads)]) # TODO: This function and any corresponding changes to support it are temporary until ORT supports GQA for CPU @@ -795,10 +829,15 @@ def make_group_query_attention(self, name, **kwargs): kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], kwargs.get("past_k", ""), kwargs.get("past_v", ""), kwargs.get("seqlens_k", ""), kwargs.get("total_seq_len", ""), + kwargs.get("cos_cache", ""), kwargs.get("sin_cache", "") ] output = f"{name}/output_0" outputs = [output, kwargs.get("present_k", ""), kwargs.get("present_v", "")] - self.make_node("GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, local_window_size=self.window_size) + self.make_node( + "GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", + num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, local_window_size=self.window_size, + do_rotary=self.attention_attrs["use_rotemb_in_gqa"], rotary_interleaved=self.rotemb_attrs["interleaved"], + ) self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * self.num_attn_heads]) def make_attention(self, layer_id, attention, root_input, **kwargs): @@ -841,60 +880,75 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): v_input_to_attention = "" # Make MatMul nodes - q_matmul_name = f"/model/layers.{layer_id}/attn/q_proj/MatMul" - self.make_matmul(attention.q_proj.weight.detach().numpy(), q_matmul_name, root_input) - q_input_to_attention = f"{q_matmul_name}/output_0" - k_matmul_name = f"/model/layers.{layer_id}/attn/k_proj/MatMul" - self.make_matmul(attention.k_proj.weight.detach().numpy(), k_matmul_name, root_input) - k_input_to_attention = f"{k_matmul_name}/output_0" - v_matmul_name = f"/model/layers.{layer_id}/attn/v_proj/MatMul" - self.make_matmul(attention.v_proj.weight.detach().numpy(), v_matmul_name, root_input) - v_input_to_attention = f"{v_matmul_name}/output_0" + if self.attention_attrs["use_packed_matmul"]: + # Combine 3 MatMuls into 1 packed MatMul + qkv_matmul_name = f"/model/layers.{layer_id}/attn/qkv_proj/MatMul" + self.make_packed_matmul(attention.q_proj.weight.detach().numpy(), attention.k_proj.weight.detach().numpy(), attention.v_proj.weight.detach().numpy(), qkv_matmul_name, root_input) + 
q_input_to_attention = f"{qkv_matmul_name}/output_0" + else: + q_matmul_name = f"/model/layers.{layer_id}/attn/q_proj/MatMul" + self.make_matmul(attention.q_proj.weight.detach().numpy(), q_matmul_name, root_input) + q_input_to_attention = f"{q_matmul_name}/output_0" + k_matmul_name = f"/model/layers.{layer_id}/attn/k_proj/MatMul" + self.make_matmul(attention.k_proj.weight.detach().numpy(), k_matmul_name, root_input) + k_input_to_attention = f"{k_matmul_name}/output_0" + v_matmul_name = f"/model/layers.{layer_id}/attn/v_proj/MatMul" + self.make_matmul(attention.v_proj.weight.detach().numpy(), v_matmul_name, root_input) + v_input_to_attention = f"{v_matmul_name}/output_0" # Make Add nodes (if bias exists) q_bias_exists = attention.q_proj.bias is not None k_bias_exists = attention.k_proj.bias is not None v_bias_exists = attention.v_proj.bias is not None + all_bias_exists = q_bias_exists and k_bias_exists and v_bias_exists - if q_bias_exists: - q_add_name = f"/model/layers.{layer_id}/attn/q_proj/Add" - self.make_add_bias(attention.q_proj.bias.detach().numpy(), q_add_name, root_input=f"{q_matmul_name}/output_0") - q_input_to_attention = f"{q_add_name}/output_0" - if k_bias_exists: - k_add_name = f"/model/layers.{layer_id}/attn/k_proj/Add" - self.make_add_bias(attention.k_proj.bias.detach().numpy(), k_add_name, root_input=f"{k_matmul_name}/output_0") - k_input_to_attention = f"{k_add_name}/output_0" - if v_bias_exists: - v_add_name = f"/model/layers.{layer_id}/attn/v_proj/Add" - self.make_add_bias(attention.v_proj.bias.detach().numpy(), v_add_name, root_input=f"{v_matmul_name}/output_0") - v_input_to_attention = f"{v_add_name}/output_0" + if all_bias_exists and self.attention_attrs["use_packed_matmul"]: + # Combine 3 Adds into 1 packed Add + qkv_add_name = f"/model/layers.{layer_id}/attn/qkv_proj/Add" + self.make_packed_add(attention.q_proj.bias.detach().numpy(), attention.k_proj.bias.detach().numpy(), attention.v_proj.bias.detach().numpy(), qkv_add_name, root_input=q_input_to_attention) + q_input_to_attention = f"{qkv_add_name}/output_0" + else: + if q_bias_exists: + q_add_name = f"/model/layers.{layer_id}/attn/q_proj/Add" + self.make_add_bias(attention.q_proj.bias.detach().numpy(), q_add_name, root_input=q_input_to_attention) + q_input_to_attention = f"{q_add_name}/output_0" + if k_bias_exists: + k_add_name = f"/model/layers.{layer_id}/attn/k_proj/Add" + self.make_add_bias(attention.k_proj.bias.detach().numpy(), k_add_name, root_input=k_input_to_attention) + k_input_to_attention = f"{k_add_name}/output_0" + if v_bias_exists: + v_add_name = f"/model/layers.{layer_id}/attn/v_proj/Add" + self.make_add_bias(attention.v_proj.bias.detach().numpy(), v_add_name, root_input=v_input_to_attention) + v_input_to_attention = f"{v_add_name}/output_0" # Make RotaryEmbedding nodes - q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" - q_rotary_input = f"{q_matmul_name if not q_bias_exists else q_add_name}/output_0" - self.make_rotary_embedding(attention.rotary_emb, q_rotary_name, q_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) - q_input_to_attention = f"{q_rotary_name}/output_0" - - k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" - k_rotary_input = f"{k_matmul_name if not k_bias_exists else k_add_name}/output_0" - self.make_rotary_embedding(attention.rotary_emb, k_rotary_name, k_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) - k_input_to_attention = f"{k_rotary_name}/output_0" + cos_cache_name, sin_cache_name = 
"", "" + if self.attention_attrs["use_rotemb_in_gqa"]: + cos_cache_name, sin_cache_name = self.make_rotary_embedding_caches(attention.rotary_emb) + else: + q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" + self.make_rotary_embedding(attention.rotary_emb, q_rotary_name, root_input=q_input_to_attention, position_ids=kwargs.get("position_ids", "position_ids")) + q_input_to_attention = f"{q_rotary_name}/output_0" + k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" + self.make_rotary_embedding(attention.rotary_emb, k_rotary_name, root_input=k_input_to_attention, position_ids=kwargs.get("position_ids", "position_ids")) + k_input_to_attention = f"{k_rotary_name}/output_0" # Make repeat KV nodes (TODO: remove once ORT supports GQA for CPU) past_k = f"past_key_values.{layer_id}.key" past_v = f"past_key_values.{layer_id}.value" present_k = f"present.{layer_id}.key" present_v = f"present.{layer_id}.value" - if self.num_attn_heads != self.num_kv_heads and not self.attention_attrs['use_gqa']: - k_input_to_attention = self.make_repeat_kv(layer_id, k_input_to_attention, past_k, present_k) - v_input_to_attention = self.make_repeat_kv(layer_id, v_input_to_attention, past_v, present_v) + if self.num_attn_heads != self.num_kv_heads and self.attention_attrs["op_type"] != "GroupQueryAttention": + k_input_to_attention = self.make_repeat_kv(layer_id, root_input=k_input_to_attention, past_kv=past_k, present_kv=present_k) + v_input_to_attention = self.make_repeat_kv(layer_id, root_input=v_input_to_attention, past_kv=past_v, present_kv=present_v) past_k, past_v, present_k, present_v = "", "", "", "" # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" self.make_attention_op( attn_name, q_path=q_input_to_attention, k_path=k_input_to_attention, v_path=v_input_to_attention, - past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, **kwargs, + past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, + cos_cache=cos_cache_name, sin_cache=sin_cache_name, **kwargs, ) # Make MatMul node (output projection weight node) @@ -914,6 +968,14 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): self.layernorm_attrs["skip_input"] = f"{o_matmul_name if not o_bias_exists else o_add_name}/output_0" def make_mlp(self, layer_id, mlp, root_input): + if self.mlp_attrs["use_proj"]: + self.make_mlp_proj(layer_id, mlp, root_input) + elif self.mlp_attrs["use_fc"]: + self.make_mlp_fc(layer_id, mlp, root_input) + else: + raise NotImplementedError(f"The MLP layer type is not set.") + + def make_mlp_proj(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph # # root_input @@ -947,6 +1009,39 @@ def make_mlp(self, layer_id, mlp, root_input): # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" + def make_mlp_fc(self, layer_id, mlp, root_input): + # Make nodes for the MLP subgraph + # + # root_input + # | + # FC1_MatMul + # | + # FC1_Add + # | + # ActFunc + # | + # FC2_MatMul + # | + # FC2_Add + + # Make first layer of fully connected nodes (FC1) + fc1_matmul_name = f"/model/layers.{layer_id}/mlp/fc1/MatMul" + self.make_matmul(mlp.fc1.weight.detach().numpy(), fc1_matmul_name, root_input) + fc1_add_name = f"/model/layers.{layer_id}/mlp/fc1/Add" + self.make_add_bias(mlp.fc1.bias.detach().numpy(), fc1_add_name, root_input=f"{fc1_matmul_name}/output_0") + + # 
Make activation function + act_fn_name = self.make_activation(layer_id, root_input=f"{fc1_add_name}/output_0") + + # Make second layer of fully connected nodes (FC2) + fc2_matmul_name = f"/model/layers.{layer_id}/mlp/fc2/MatMul" + self.make_matmul(mlp.fc2.weight.detach().numpy(), fc2_matmul_name, root_input=f"{act_fn_name}/output_0") + fc2_add_name = f"/model/layers.{layer_id}/mlp/fc2/Add" + self.make_add_bias(mlp.fc2.bias.detach().numpy(), fc2_add_name, root_input=f"{fc2_matmul_name}/output_0") + + # Assign output 0 of MLP layer as output of last layer + self.mlp_attrs["output_0"] = f"{fc2_add_name}/output_0" + def make_activation_with_mul(self, layer_id, root_input, activation, domain): # Make nodes for this activation subgraph # @@ -1071,6 +1166,12 @@ def has_final_norm(self, module, model): return hf_norm or hf_final_layernorm or gguf_final_norm def make_attention_mask_reformatting(self): + if self.attention_attrs["op_type"] == "GroupQueryAttention": + self.make_attention_mask_reformatting_for_gqa() + else: + self.make_attention_mask_reformatting_2d_to_4d() + + def make_attention_mask_reformatting_2d_to_4d(self): # Make nodes for the attention mask subgraphs that reformat the # 2D attention mask (B, S) to 4D causal attention mask (B, N, S, T) # @@ -1370,17 +1471,7 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for return expand_name - -class LlamaModel(Model): - def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - self.model_inputs = ["input_ids", "attention_mask", "position_ids"] - - def make_attention_mask_reformatting(self): - if not self.attention_attrs["use_gqa"]: - super().make_attention_mask_reformatting() - return - + def make_attention_mask_reformatting_for_gqa(self): # Make nodes for the attention mask subgraph that calculates # attributes about the 2D attention mask to use in GroupQueryAttention # @@ -1420,12 +1511,6 @@ def make_attention_mask_reformatting(self): self.mask_attrs["seqlens_k"] = cast_1_name self.mask_attrs["total_seq_len"] = cast_2_name - -class MistralModel(LlamaModel): - def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - self.position_ids_name = self.make_position_ids_reformatting() - def make_position_ids_reformatting(self): # Make nodes for the position ids reformatting subgraph # @@ -1461,62 +1546,42 @@ def make_position_ids_reformatting(self): return reshape_name + +class LlamaModel(Model): + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + + +class MistralModel(Model): + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + self.position_ids_name = f"{self.make_position_ids_reformatting()}/output_0" if not self.attention_attrs["use_rotemb_in_gqa"] else "position_ids" + def make_attention(self, layer_id, attention, root_input, **kwargs): - super().make_attention(layer_id, attention, root_input, position_ids=f"{self.position_ids_name}/output_0", **kwargs) + super().make_attention(layer_id, attention, root_input, position_ids=self.position_ids_name, **kwargs) -class PhiModel(LlamaModel): +class PhiModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): 
super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor self.layernorm_attrs["simple"] = False self.rotemb_attrs["num_heads"] = self.num_attn_heads self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) + self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = False, True def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) - - def make_mlp(self, layer_id, mlp, root_input): - # Make nodes for the MLP subgraph - # - # root_input - # | - # FC1_MatMul - # | - # FC1_Add - # | - # FastGelu - # | - # FC2_MatMul - # | - # FC2_Add - - # Make first layer of fully connected nodes (FC1) - fc1_matmul_name = f"/model/layers.{layer_id}/mlp/fc1/MatMul" - self.make_matmul(mlp.fc1.weight.detach().numpy(), fc1_matmul_name, root_input) - fc1_add_name = f"/model/layers.{layer_id}/mlp/fc1/Add" - self.make_add_bias(mlp.fc1.bias.detach().numpy(), fc1_add_name, root_input=f"{fc1_matmul_name}/output_0") - - # Make activation function - fast_gelu_name = self.make_activation(layer_id, root_input=f"{fc1_add_name}/output_0") - - # Make second layer of fully connected nodes (FC2) - fc2_matmul_name = f"/model/layers.{layer_id}/mlp/fc2/MatMul" - self.make_matmul(mlp.fc2.weight.detach().numpy(), fc2_matmul_name, root_input=f"{fast_gelu_name}/output_0") - fc2_add_name = f"/model/layers.{layer_id}/mlp/fc2/Add" - self.make_add_bias(mlp.fc2.bias.detach().numpy(), fc2_add_name, root_input=f"{fc2_matmul_name}/output_0") - - return fc2_add_name def make_layer(self, layer_id, layer): # Each Phi decoder layer is defined as: # input_layernorm --> attention --> MLP --> residual_add self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") self.make_attention(layer_id, layer.self_attn, self.layernorm_attrs["output_0"]) - fc2_add_name = self.make_mlp(layer_id, layer.mlp, self.layernorm_attrs["output_0"]) + self.make_mlp(layer_id, layer.mlp, self.layernorm_attrs["output_0"]) residual_add_name = f"/model/layers.{layer_id}/residual_add/Add" - residual_add_inputs = [self.layernorm_attrs['skip_input'], f"{fc2_add_name}/output_0"] + residual_add_inputs = [self.layernorm_attrs['skip_input'], self.mlp_attrs["output_0"]] self.make_add(residual_add_name, residual_add_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) self.layernorm_attrs["first_layernorm"] = False From bc503fb1929525aa0a4bdc513fc2321ea951f5c8 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Tue, 9 Apr 2024 14:40:44 -0700 Subject: [PATCH 36/36] Add C++ benchmark program (#243) Add a C++ model benchmark program. It is modeled after the existing Python benchmark script (benchmark/python/benchmark_e2e.py). The motivation for a C++ version is to be able to run without Python. This is useful for Android. 
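A rough example invocation, for illustration only: the executable name comes from the new `model_benchmark` CMake target added in this patch, the flag names from the parser in benchmark/c/options.cpp, and the model folder path is a placeholder.

model_benchmark -i {path to ONNX model folder} -b 1 -l 16 -g 128 -r 5 -w 1 -v

The numeric values shown simply restate the defaults declared in benchmark/c/options.h (batch size 1, 16 prompt tokens, 128 generated tokens, 5 measured repetitions, 1 warmup run), so they can be omitted for the same behavior.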
--- CMakeLists.txt | 20 ++- benchmark/c/CMakeLists.txt | 25 ++++ benchmark/c/main.cpp | 242 ++++++++++++++++++++++++++++++++++++ benchmark/c/options.cpp | 110 ++++++++++++++++ benchmark/c/options.h | 22 ++++ cmake/options.cmake | 1 + src/csharp/Generator.cs | 4 +- src/csharp/NativeMethods.cs | 8 +- src/ort_genai.h | 50 ++++++-- src/ort_genai_c.cpp | 4 +- src/ort_genai_c.h | 9 +- src/python/CMakeLists.txt | 1 - test/CMakeLists.txt | 1 - test/c_api_tests.cpp | 24 ++-- 14 files changed, 486 insertions(+), 35 deletions(-) create mode 100644 benchmark/c/CMakeLists.txt create mode 100644 benchmark/c/main.cpp create mode 100644 benchmark/c/options.cpp create mode 100644 benchmark/c/options.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ff70bea11..b325c2763 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,11 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_definitions(_DEBUG=1) endif() +if(MSVC) + # set updated value for __cplusplus macro instead of 199711L + add_compile_options($<$:/Zc:__cplusplus>) +endif() + message(STATUS "Adding source files") file(GLOB generator_srcs CONFIGURE_DEPENDS @@ -127,6 +132,11 @@ else() set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.so") endif() +file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") +if(USE_DML) + list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") +endif() + if(NO_TOKENIZEROOT) add_compile_definitions(NO_TOKENIZER=1) message("----------------Tokenizer Disabled------------------") @@ -148,6 +158,11 @@ if(ENABLE_PYTHON) message("------------------Enabling Python Wheel------------------") endif() +if(ENABLE_MODEL_BENCHMARK) + add_subdirectory("${CMAKE_SOURCE_DIR}/benchmark/c") + message("------------------Enabling model benchmark------------------") +endif() + if(NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_LIB}") message(FATAL_ERROR "Expected the ONNX Runtime library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_LIB}. Actual: Not found.") endif() @@ -158,7 +173,6 @@ if(USE_CUDA AND NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_CUDA_LIB}") message(FATAL_ERROR "Expected the ONNX Runtime providers cuda library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_CUDA_LIB}. Actual: Not found.") endif() -file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) target_link_libraries(onnxruntime-genai PRIVATE ${ONNXRUNTIME_LIB}) @@ -182,10 +196,6 @@ if(MSVC) endif() # Copy the onnxruntime binaries into the build folder so it's found on launch -file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") -if(USE_DML) - list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") -endif() foreach(DLL_FILE ${onnxruntime_libs}) add_custom_command( TARGET onnxruntime-genai POST_BUILD diff --git a/benchmark/c/CMakeLists.txt b/benchmark/c/CMakeLists.txt new file mode 100644 index 000000000..0035f3e5e --- /dev/null +++ b/benchmark/c/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +set(model_benchmark_srcs + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/options.h + ${CMAKE_CURRENT_SOURCE_DIR}/options.cpp +) + +add_executable(model_benchmark ${model_benchmark_srcs}) + +target_include_directories(model_benchmark PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/src # directory containing the ort_genai headers +) + +target_link_libraries(model_benchmark PRIVATE onnxruntime-genai-static ${ONNXRUNTIME_LIB}) + +target_link_directories(model_benchmark PRIVATE ${ORT_LIB_DIR}) + +add_custom_command(TARGET model_benchmark POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${onnxruntime_libs} $ +) + +source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${model_benchmark_srcs}) diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp new file mode 100644 index 000000000..0a6840c42 --- /dev/null +++ b/benchmark/c/main.cpp @@ -0,0 +1,242 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ort_genai.h" + +#include "options.h" + +namespace { + +using Clock = std::chrono::steady_clock; + +using Duration = Clock::duration; +using DurationFp = std::chrono::duration; + +class Timing { + public: + Timing(const Timing&) = delete; + Timing& operator=(const Timing&) = delete; + + Timing(std::vector& measurements) + : measurements_{measurements}, start_{Clock::now()} { + } + + ~Timing() { + const auto measurement = Clock::now() - start_; + measurements_.push_back(measurement); + } + + private: + std::vector& measurements_; + const Clock::time_point start_; +}; + +struct Statistics { + DurationFp average{}; + DurationFp stddev{}; + DurationFp p50{}; + DurationFp p90{}; + DurationFp p99{}; + size_t n{}; +}; + +Statistics ComputeStats(const std::vector& measurements) { + Statistics stats{}; + if (measurements.empty()) { + return stats; + } + + stats.n = measurements.size(); + + const auto sum = std::accumulate(measurements.begin(), measurements.end(), Duration{0}); + stats.average = DurationFp{sum} / stats.n; + + std::vector sorted = measurements; + std::sort(sorted.begin(), sorted.end()); + + stats.p50 = sorted[stats.n * 0.5]; + stats.p90 = sorted[stats.n * 0.9]; + stats.p99 = sorted[stats.n * 0.99]; + + if (stats.n > 1) { + const float variance = + std::accumulate( + measurements.begin(), measurements.end(), + 0.0f, + [mean = stats.average.count()](float accumulator, const Duration& m) -> float { + const float distance_from_mean = m.count() - mean; + return accumulator + distance_from_mean * distance_from_mean; + }) / + (stats.n - 1); + + const float stddev = std::sqrt(variance); + stats.stddev = DurationFp{stddev}; + } + + return stats; +} + +void WritePerTokenStats(std::string_view label, + const Statistics& stats, + const size_t tokens_per_measurement) { + using MicrosecondsFp = std::chrono::duration; + const auto avg_us = MicrosecondsFp{stats.average}; + std::cout << label << ":" + << "\n\tavg (us): " << avg_us.count() + << "\n\tavg (tokens/s): " << 1.0e6f / avg_us.count() * tokens_per_measurement + << "\n\tp50 (us): " << MicrosecondsFp{stats.p50}.count() + << "\n\tstddev (us): " << MicrosecondsFp{stats.stddev}.count() + << "\n\tn: " << stats.n << " * " << tokens_per_measurement << " token(s)" + << "\n"; +} + +void WriteE2EStats(std::string_view label, + const Statistics& stats) { + using MillisecondsFp = std::chrono::duration; + std::cout << label << ":" + << "\n\tavg (ms): " << 
MillisecondsFp{stats.average}.count() + << "\n\tp50 (ms): " << MillisecondsFp{stats.p50}.count() + << "\n\tstddev (ms): " << MillisecondsFp{stats.stddev}.count() + << "\n\tn: " << stats.n + << "\n"; +} + +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { + const char* const base_prompt = "A"; + auto base_prompt_sequences = OgaSequences::Create(); + + tokenizer.Encode(base_prompt, *base_prompt_sequences); + + auto params = OgaGeneratorParams::Create(model); + params->SetSearchOption("max_length", num_prompt_tokens); + params->SetSearchOption("min_length", num_prompt_tokens); + params->SetInputSequences(*base_prompt_sequences); + + auto output_sequences = model.Generate(*params); + const auto output_sequence_length = output_sequences->SequenceCount(0); + const auto* output_sequence_data = output_sequences->SequenceData(0); + return std::string{tokenizer.Decode(output_sequence_data, output_sequence_length)}; +} + +void RunBenchmark(const benchmark::Options& opts) { + auto model = OgaModel::Create(opts.model_path.c_str()); + auto tokenizer = OgaTokenizer::Create(*model); + + const std::string prompt = GeneratePrompt(opts.num_prompt_tokens, *model, *tokenizer); + auto prompt_sequences = OgaSequences::Create(); + + if (opts.batch_size < 1) { + throw std::runtime_error("Batch size must be at least 1."); + } + + for (size_t i = 0; i < opts.batch_size; ++i) { + tokenizer->Encode(prompt.c_str(), *prompt_sequences); + } + + const size_t num_prompt_tokens = prompt_sequences->SequenceCount(0); + const size_t num_tokens = num_prompt_tokens + opts.num_tokens_to_generate; + + auto make_generator_params = [&] { + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", num_tokens); + params->SetSearchOption("min_length", num_tokens); + params->SetInputSequences(*prompt_sequences); + return params; + }; + + const auto generator_params = make_generator_params(); + + // warmup + if (opts.verbose) std::cout << "Running warmup iterations (" << opts.num_warmup_iterations << ")...\n"; + for (size_t i = 0; i < opts.num_warmup_iterations; ++i) { + auto output_sequences = model->Generate(*generator_params); + + if (opts.verbose && i == 0) { + // show prompt and output on first iteration + std::cout << "Prompt:\n\t" << prompt << "\n"; + const auto output_sequence_length = output_sequences->SequenceCount(0); + const auto* output_sequence_data = output_sequences->SequenceData(0); + const auto output = tokenizer->Decode(output_sequence_data, output_sequence_length); + std::cout << "Output:\n\t" << output << "\n"; + } + } + + std::vector e2e_gen_times, prompt_processing_times, token_gen_times, sampling_times; + // note: be sure to reserve enough to avoid vector reallocations in the measured code + e2e_gen_times.reserve(opts.num_iterations); + prompt_processing_times.reserve(opts.num_iterations); + token_gen_times.reserve(opts.num_iterations * (opts.num_tokens_to_generate - 1)); + sampling_times.reserve(opts.num_iterations * opts.num_tokens_to_generate); + + if (opts.verbose) std::cout << "Running iterations (" << opts.num_iterations << ")...\n"; + for (size_t i = 0; i < opts.num_iterations; ++i) { + auto generator = OgaGenerator::Create(*model, *generator_params); + + { + Timing e2e_gen_timing{e2e_gen_times}; + + { + Timing prompt_processing_timing{prompt_processing_times}; + generator->ComputeLogits(); + } + + { + Timing sampling_timing{sampling_times}; + generator->GenerateNextToken(); + } + + while (!generator->IsDone()) { + { 
+ Timing token_gen_timing{token_gen_times}; + generator->ComputeLogits(); + } + + { + Timing sampling_timing{sampling_times}; + generator->GenerateNextToken(); + } + } + } + } + + { + std::cout << "Batch size: " << opts.batch_size + << ", prompt tokens: " << num_prompt_tokens + << ", tokens to generate: " << opts.num_tokens_to_generate + << "\n"; + + const auto e2e_gen_stats = ComputeStats(e2e_gen_times); + const auto prompt_processing_stats = ComputeStats(prompt_processing_times); + const auto token_gen_stats = ComputeStats(token_gen_times); + const auto sampling_stats = ComputeStats(sampling_times); + + WritePerTokenStats("Prompt processing (time to first token)", + prompt_processing_stats, opts.batch_size * num_prompt_tokens); + WritePerTokenStats("Token generation", token_gen_stats, opts.batch_size); + WritePerTokenStats("Token sampling", sampling_stats, opts.batch_size); + WriteE2EStats("E2E generation (entire generation loop)", e2e_gen_stats); + } +} + +} // namespace + +int main(int argc, char** argv) { + try { + const auto opts = benchmark::ParseOptionsFromCommandLine(argc, argv); + RunBenchmark(opts); + return 0; + } catch (const std::exception& e) { + std::cerr << "Exception: " << e.what() << "\n"; + return 1; + } +} diff --git a/benchmark/c/options.cpp b/benchmark/c/options.cpp new file mode 100644 index 000000000..7047a4466 --- /dev/null +++ b/benchmark/c/options.cpp @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "options.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace benchmark { + +namespace { + +[[noreturn]] void PrintHelpAndExit(const char* program_name, int exit_code) { + Options defaults{}; + std::ostringstream s; + + s << "Usage: " << program_name << " -i \n" + << " Options:\n" + << " -i,--input_folder \n" + << " Path to the ONNX model directory to benchmark, compatible with onnxruntime-genai.\n " + << " -b,--batch_size \n" + << " Number of sequences to generate in parallel. Default: " << defaults.batch_size << "\n" + << " -l,--prompt_length \n" + << " Number of tokens in the prompt. Default: " << defaults.num_prompt_tokens << "\n" + << " -g,--generation_length \n" + << " Number of tokens to generate. Default: " << defaults.num_tokens_to_generate << "\n" + << " -r,--repetitions \n" + << " Number of times to repeat the benchmark. Default: " << defaults.num_iterations << "\n" + << " -w,--warmup \n" + << " Number of warmup runs before benchmarking. Default: " << defaults.num_warmup_iterations << "\n" + << " -v,--verbose\n" + << " Show more informational output.\n" + << " -h,--help\n" + << " Show this help message and exit.\n"; + + std::cerr << s.str(); + std::exit(exit_code); +} + +template +T ParseNumber(std::string_view s) { + T n; + const auto *s_begin = s.data(), *s_end = s.data() + s.size(); + const auto [ptr, ec] = std::from_chars(s_begin, s_end, n); + if (ec != std::errc{} || ptr != s_end) { + throw std::runtime_error(std::string{"Failed to parse option value as number: "}.append(s)); + } + return n; +} + +void VerifyOptions(const Options& opts) { + if (opts.model_path.empty()) { + throw std::runtime_error("ONNX model directory path must be provided."); + } +} + +} // namespace + +Options ParseOptionsFromCommandLine(int argc, const char* const* argv) { + const char* const program_name = argc > 0 ? 
argv[0] : "model_benchmark"; + try { + Options opts{}; + + auto next_arg = [argc, argv](int& idx) { + if (idx + 1 >= argc) { + throw std::runtime_error("Option value not provided."); + } + return std::string_view{argv[++idx]}; + }; + + for (int i = 1; i < argc; ++i) { + std::string_view arg{argv[i]}; + + if (arg == "-i" || arg == "--input_folder") { + opts.model_path = next_arg(i); + } else if (arg == "-b" || arg == "--batch_size") { + opts.batch_size = ParseNumber(next_arg(i)); + } else if (arg == "-l" || arg == "--prompt_length") { + opts.num_prompt_tokens = ParseNumber(next_arg(i)); + } else if (arg == "-g" || arg == "--generation_length") { + opts.num_tokens_to_generate = ParseNumber(next_arg(i)); + } else if (arg == "-r" || arg == "--repetitions") { + opts.num_iterations = ParseNumber(next_arg(i)); + } else if (arg == "-w" || arg == "--warmup") { + opts.num_warmup_iterations = ParseNumber(next_arg(i)); + } else if (arg == "-v" || arg == "--verbose") { + opts.verbose = true; + } else if (arg == "-h" || arg == "--help") { + PrintHelpAndExit(program_name, 0); + } else { + throw std::runtime_error(std::string{"Unknown option: "}.append(arg)); + } + } + + VerifyOptions(opts); + + return opts; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << "\n"; + PrintHelpAndExit(program_name, 1); + } +} + +} // namespace benchmark diff --git a/benchmark/c/options.h b/benchmark/c/options.h new file mode 100644 index 000000000..a00d19191 --- /dev/null +++ b/benchmark/c/options.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +namespace benchmark { + +struct Options { + std::string model_path{}; + size_t num_prompt_tokens{16}; + size_t num_tokens_to_generate{128}; + size_t batch_size{1}; + size_t num_iterations{5}; + size_t num_warmup_iterations{1}; + bool verbose{false}; +}; + +Options ParseOptionsFromCommandLine(int argc, const char* const* argv); + +} // namespace benchmark diff --git a/cmake/options.cmake b/cmake/options.cmake index 80f004215..ac40a6d1d 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -6,5 +6,6 @@ option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." 
ON) option(ENABLE_TESTS "Enable tests" ON) option(TEST_PHI2 "Enable tests for Phi2" OFF) +option(ENABLE_MODEL_BENCHMARK "Build model benchmark program" ON) cmake_dependent_option(BUILD_WHEEL "Build the python wheel" ON "ENABLE_PYTHON" OFF) \ No newline at end of file diff --git a/src/csharp/Generator.cs b/src/csharp/Generator.cs index 64c1c5623..e2772d632 100644 --- a/src/csharp/Generator.cs +++ b/src/csharp/Generator.cs @@ -32,8 +32,8 @@ public void GenerateNextToken() public ReadOnlySpan GetSequence(ulong index) { - ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceLength(_generatorHandle, (UIntPtr)index).ToUInt64(); - IntPtr sequencePtr = NativeMethods.OgaGenerator_GetSequence(_generatorHandle, (UIntPtr)index); + ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceCount(_generatorHandle, (UIntPtr)index).ToUInt64(); + IntPtr sequencePtr = NativeMethods.OgaGenerator_GetSequenceData(_generatorHandle, (UIntPtr)index); unsafe { return new ReadOnlySpan(sequencePtr.ToPointer(), (int)sequenceLength); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 552c9046a..7766c5e02 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -82,15 +82,15 @@ internal class NativeLib // This function returns the length of the sequence at the given index. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceLength(IntPtr /* const OgaGenerator* */ generator, - UIntPtr /* size_t */ index); + public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceCount(IntPtr /* const OgaGenerator* */ generator, + UIntPtr /* size_t */ index); // This function returns the sequence data at the given index. The returned pointer is owned by the // OgaGenerator object and will be freed when the OgaGenerator object is destroyed. It is expected // that the caller copies the data returned by this function after calling this function. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* const in32_t* */ OgaGenerator_GetSequence(IntPtr /* const OgaGenerator* */ generator, - UIntPtr /* size_t */ index); + public static extern IntPtr /* const in32_t* */ OgaGenerator_GetSequenceData(IntPtr /* const OgaGenerator* */ generator, + UIntPtr /* size_t */ index); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaCreateSequences(out IntPtr /* OgaSequences** */ sequences); diff --git a/src/ort_genai.h b/src/ort_genai.h index 82f8c722c..ea831ee2e 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -1,5 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+ +#pragma once + +#include +#include + +#if __cplusplus >= 202002L +#include +#endif + #include "ort_genai_c.h" // GenAI C++ API @@ -55,7 +65,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -84,9 +94,19 @@ struct OgaSequences : OgaAbstract { return OgaSequencesCount(this); } + size_t SequenceCount(size_t index) const { + return OgaSequencesGetSequenceCount(this, index); + } + + const int32_t* SequenceData(size_t index) const { + return OgaSequencesGetSequenceData(this, index); + } + +#if __cplusplus >= 202002L std::span Get(size_t index) const { - return {OgaSequencesGetSequenceData(this, index), OgaSequencesGetSequenceCount(this, index)}; + return {SequenceData(index), SequenceCount(index)}; } +#endif static void operator delete(void* p) { OgaDestroySequences(reinterpret_cast(p)); } }; @@ -102,11 +122,19 @@ struct OgaTokenizer : OgaAbstract { OgaCheckResult(OgaTokenizerEncode(this, str, &sequences)); } + OgaString Decode(const int32_t* tokens_data, size_t tokens_length) const { + const char* p; + OgaCheckResult(OgaTokenizerDecode(this, tokens_data, tokens_length, &p)); + return p; + } + +#if __cplusplus >= 202002L OgaString Decode(std::span tokens) const { const char* p; OgaCheckResult(OgaTokenizerDecode(this, tokens.data(), tokens.size(), &p)); return p; } +#endif static void operator delete(void* p) { OgaDestroyTokenizer(reinterpret_cast(p)); } }; @@ -139,15 +167,11 @@ struct OgaGeneratorParams : OgaAbstract { return std::unique_ptr(p); } - void SetSearchOption(const char* name, int value) { - OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); - } - void SetSearchOption(const char* name, double value) { OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); } - void SetSearchOption(const char* name, bool value) { + void SetSearchOptionBool(const char* name, bool value) { OgaCheckResult(OgaGeneratorParamsSetSearchBool(this, name, value)); } @@ -181,9 +205,19 @@ struct OgaGenerator : OgaAbstract { OgaCheckResult(OgaGenerator_GenerateNextToken(this)); } + size_t GetSequenceCount(size_t index) const { + return OgaGenerator_GetSequenceCount(this, index); + } + + const int32_t* GetSequenceData(size_t index) const { + return OgaGenerator_GetSequenceData(this, index); + } + +#if __cplusplus >= 202002L std::span GetSequence(size_t index) const { - return {OgaGenerator_GetSequence(this, index), OgaGenerator_GetSequenceLength(this, index)}; + return {GetSequenceData(index), GetSequenceCount(index)}; } +#endif static void operator delete(void* p) { OgaDestroyGenerator(reinterpret_cast(p)); } }; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index e9548d509..78c1b8ecd 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -157,12 +157,12 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) OGA_CATCH } -size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* oga_generator, size_t index) { +size_t OGA_API_CALL OgaGenerator_GetSequenceCount(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().size(); } -const int32_t* OGA_API_CALL OgaGenerator_GetSequence(const OgaGenerator* oga_generator, size_t index) { +const int32_t* OGA_API_CALL 
OgaGenerator_GetSequenceData(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().data(); } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 41eb65909..21a1fb2f7 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -1,5 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. + +#pragma once + #include #include @@ -179,17 +182,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* * \param[in] generator The generator to get the count of the tokens for the sequence at the given index. * \return The number tokens in the sequence at the given index. */ -OGA_EXPORT size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* generator, size_t index); +OGA_EXPORT size_t OGA_API_CALL OgaGenerator_GetSequenceCount(const OgaGenerator* generator, size_t index); /* * \brief Returns a pointer to the sequence data at the given index. The number of tokens in the sequence - * is given by OgaGenerator_GetSequenceLength + * is given by OgaGenerator_GetSequenceCount * \param[in] generator The generator to get the sequence data for the sequence at the given index. * \return The pointer to the sequence data at the given index. The sequence data is owned by the OgaGenerator * and will be freed when the OgaGenerator is destroyed. The caller must copy the data if it needs to * be used after the OgaGenerator is destroyed. */ -OGA_EXPORT const int32_t* OGA_API_CALL OgaGenerator_GetSequence(const OgaGenerator* generator, size_t index); +OGA_EXPORT const int32_t* OGA_API_CALL OgaGenerator_GetSequenceData(const OgaGenerator* generator, size_t index); OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateTokenizer(const OgaModel* model, OgaTokenizer** out); OGA_EXPORT void OGA_API_CALL OgaDestroyTokenizer(OgaTokenizer*); diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 942664246..e2175039a 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -44,7 +44,6 @@ if(BUILD_WHEEL) file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/py/" DESTINATION ${WHEEL_TARGET_NAME}/) file(COPY "${CMAKE_SOURCE_DIR}/ThirdPartyNotices.txt" DESTINATION ${WHEEL_TARGET_NAME}/) - file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") add_custom_command(TARGET python POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_libs} $ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 83571118e..daf8c40b3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -49,7 +49,6 @@ else() target_include_directories(unit_tests PRIVATE ${TOKENIZER_ROOT}) target_link_libraries(unit_tests PRIVATE tokenizer) endif() -file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") set(TEST_MODEL_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/test_models/") set(TEST_MODEL_DES_DIR "$/test_models/") add_custom_command(TARGET unit_tests POST_BUILD diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index a1ec2b923..fbdb4e541 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -120,10 +120,13 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - auto sequence = generator->GetSequence(i); + const auto sequence_length = generator->GetSequenceCount(i); + const auto* sequence_data = generator->GetSequenceData(i); - auto* expected_output_start = &expected_output[i * max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, 
sequence.data(), max_length * sizeof(int32_t))); + ASSERT_LE(sequence_length, max_length); + + const auto* expected_output_start = &expected_output[i * max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t))); } // Test high level API @@ -131,10 +134,13 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - auto sequence = sequences->Get(i); + const auto sequence_length = sequences->SequenceCount(i); + const auto* sequence_data = sequences->SequenceData(i); + + ASSERT_LE(sequence_length, max_length); - auto* expected_output_start = &expected_output[i * max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); + const auto* expected_output_start = &expected_output[i * max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t))); } } @@ -199,7 +205,7 @@ struct Phi2Test { TEST(CAPITests, TopKCAPI) { Phi2Test test; - test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOptionBool("do_sample", true); test.params_->SetSearchOption("top_k", 50); test.params_->SetSearchOption("temperature", 0.6f); @@ -209,7 +215,7 @@ TEST(CAPITests, TopKCAPI) { TEST(CAPITests, TopPCAPI) { Phi2Test test; - test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOptionBool("do_sample", true); test.params_->SetSearchOption("top_p", 0.6f); test.params_->SetSearchOption("temperature", 0.6f); @@ -219,7 +225,7 @@ TEST(CAPITests, TopPCAPI) { TEST(CAPITests, TopKTopPCAPI) { Phi2Test test; - test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOptionBool("do_sample", true); test.params_->SetSearchOption("top_k", 50); test.params_->SetSearchOption("top_p", 0.6f); test.params_->SetSearchOption("temperature", 0.6f);