From f28e2420b530797a64adc060b95476643ab71aa2 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Thu, 14 Mar 2024 17:00:15 -0700 Subject: [PATCH 01/36] Update example READMEs (#199) --- examples/csharp/README.md | 43 ++++++++++++++++++++--------------- examples/python/README.md | 48 ++++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 41 deletions(-) diff --git a/examples/csharp/README.md b/examples/csharp/README.md index d6b4010f3..dce940811 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -2,31 +2,38 @@ ## Install the onnxruntime-genai library -* Install the python package - - ```bash - pip install onnxruntime-genai - ``` ## Get the model -Install the model builder script dependencies +You can generate the model using the model builder this library, or bring your own model. + +If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). + +To generate the model with model builder: + +1. Install the python package -```bash -pip install numpy -pip install transformers -pip install torch -pip install onnx -pip install onnxruntime -``` + Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). -Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) -```bash -cd examples\\phi2\\csharp -python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o phi-2\ -``` +2. Install the model builder script dependencies + + ```bash + pip install numpy + pip install transformers + pip install torch + pip install onnx + pip install onnxruntime + ``` +3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) + + ```bash + cd examples/python + python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu + ``` ## Run the phi-2 model +Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). + Open [HelloPhi2.sln](HelloPhi2.sln) and run the console application. diff --git a/examples/python/README.md b/examples/python/README.md index 9c7a2cc25..2f20cfcd0 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -2,37 +2,39 @@ ## Install the onnxruntime-genai library -* Install the python package +Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). - ```bash - cd build/wheel - pip install onnxruntime-genai-*.whl - ``` ## Get the model -Install the model builder script dependencies +You can generate the model using the model builder this library, or bring your own model. -```bash -pip install numpy -pip install transformers -pip install torch -pip install onnx -pip install onnxruntime-gpu -``` +If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). -Choose a model. Examples of supported ones are: -- Phi-2 -- Mistral -- Gemma 2B IT -- LLama 7B +To generate the model with model builder: -Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) +1. 
Install the model builder script dependencies -```bash -cd examples/python -python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu -``` + ```bash + pip install numpy + pip install transformers + pip install torch + pip install onnx + pip install onnxruntime + ``` + +2. Choose a model. Examples of supported ones are: + - Phi-2 + - Mistral + - Gemma 2B IT + - LLama 7B + +3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) + + ```bash + cd examples/python + python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu + ``` ## Run the example model script From 143308f5fa513b7765747562631d66c6b26fba05 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Thu, 14 Mar 2024 20:55:21 -0700 Subject: [PATCH 02/36] Better error checking & reporting (#203) Default batch size to 1 + sanity check batch_size Improve a common JSON error messge wording for extra ',' Improve error reporting in python search parameter type --- examples/python/model-chat.py | 4 ++++ src/generators.cpp | 4 ++++ src/generators.h | 2 +- src/json.cpp | 4 ++-- src/python/python.cpp | 5 +++-- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py index e31403065..edbfc1fc0 100644 --- a/examples/python/model-chat.py +++ b/examples/python/model-chat.py @@ -13,6 +13,10 @@ def main(args): # Keep asking for input prompts in an loop while True: text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + input_tokens = tokenizer.encode(text) params = og.GeneratorParams(model) diff --git a/src/generators.cpp b/src/generators.cpp index db27628da..640b6559e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -71,6 +71,10 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ throw std::runtime_error("search max_length is 0"); if (params.search.max_length > model.config_->model.context_length) throw std::runtime_error("max_length cannot be greater than model context_length"); + if (params.batch_size < 1) + throw std::runtime_error("batch_size must be 1 or greater"); + if (params.vocab_size < 1) + throw std::runtime_error("vocab_size must be 1 or greater"); search_ = CreateSearch(params); state_ = model.CreateState(search_->GetSequenceLengths(), params); diff --git a/src/generators.h b/src/generators.h index af98aea44..433fda103 100644 --- a/src/generators.h +++ b/src/generators.h @@ -56,7 +56,7 @@ struct GeneratorParams { int vocab_size{}; int context_length{}; - int batch_size{}; + int batch_size{1}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } diff --git a/src/json.cpp b/src/json.cpp index 412b98509..7e487ac99 100644 --- a/src/json.cpp +++ b/src/json.cpp @@ -49,7 +49,7 @@ JSON::JSON(Element& element, std::string_view document) : begin_{document.data() int line = 1; const auto* last_cr = begin_; for (const auto* p = begin_; p < current_; p++) { - if (*p == '\r') { + if (*p == '\n') { line++; last_cr = p; } @@ -108,7 +108,7 @@ void JSON::Parse_Object(Element& element) { while (true) { if (!Skip('\"')) { - throw std::runtime_error("Expecting \""); + throw std::runtime_error("Expecting \" to start next object name, possibly due to an extra trailing ',' before this"); } auto name = Parse_String(); diff --git 
a/src/python/python.cpp b/src/python/python.cpp index 3ed2e6170..a1667eb90 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -3,6 +3,7 @@ #include #include #include "../generators.h" +#include "../json.h" #include "../search.h" #include "../models/model.h" @@ -97,8 +98,8 @@ struct PyGeneratorParams : GeneratorParams { } else if (pybind11::isinstance(entry.second)) { SetSearchNumber(search, name, entry.second.cast()); } else - throw std::runtime_error("Unknown search option type, can be float/bool/int"); - } catch (const std::exception& e) { + throw std::runtime_error("Unknown search option type, can be float/bool/int:" + name); + } catch (JSON::unknown_value_error& e) { throw std::runtime_error("Unknown search option:" + name); } } From b740fd3238b83613f67f575d0a1ad8dfd9e7030d Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 15 Mar 2024 09:08:58 -0700 Subject: [PATCH 03/36] C# example README update (#202) --- examples/csharp/README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/csharp/README.md b/examples/csharp/README.md index dce940811..cb6675e74 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -1,13 +1,8 @@ # Gen-AI C# Phi-2 Example -## Install the onnxruntime-genai library - - ## Get the model -You can generate the model using the model builder this library, or bring your own model. - -If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). +You can generate the model using the model builder provided with this library, or bring your own model. To generate the model with model builder: @@ -15,7 +10,6 @@ To generate the model with model builder: Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). - 2. Install the model builder script dependencies ```bash @@ -25,6 +19,7 @@ To generate the model with model builder: pip install onnx pip install onnxruntime ``` + 3. Run the model builder script to export, optimize, and quantize the model. More details can be found [here](../../src/python/py/models/README.md) ```bash @@ -32,6 +27,10 @@ To generate the model with model builder: python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./example-models/phi2-int4-cpu ``` +The model builder also generates the configuration needed by the API to run generation. You can modify the config according to your scenario. + +If you bring your own model, you need to provide the configuration. See the [config reference](https://onnxruntime.ai/docs/genai/reference/config). + ## Run the phi-2 model Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). From e738f62ab4aac53612dcdca758d449cb772ac0c8 Mon Sep 17 00:00:00 2001 From: Adam Clark Date: Sat, 16 Mar 2024 05:55:40 +1300 Subject: [PATCH 04/36] C# Streaming Token Example (#205) --- examples/csharp/Program.cs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/examples/csharp/Program.cs b/examples/csharp/Program.cs index e70920cc0..993af8b57 100644 --- a/examples/csharp/Program.cs +++ b/examples/csharp/Program.cs @@ -9,6 +9,11 @@ using Model model = new Model(modelPath); using Tokenizer tokenizer = new Tokenizer(model); +Console.WriteLine("Please enter option number:"); +Console.WriteLine("1. Complete Output"); +Console.WriteLine("2. 
Streaming Output"); +int.TryParse(Console.ReadLine(), out var option); + while (true) { Console.WriteLine("Prompt:"); @@ -21,9 +26,24 @@ generatorParams.SetSearchOption("max_length", 200); generatorParams.SetInputSequences(sequences); - var outputSequences = model.Generate(generatorParams); - var outputString = tokenizer.Decode(outputSequences[0]); + if (option == 1) // Complete Output + { + var outputSequences = model.Generate(generatorParams); + var outputString = tokenizer.Decode(outputSequences[0]); + + Console.WriteLine("Output:"); + Console.WriteLine(outputString); + } - Console.WriteLine("Output:"); - Console.WriteLine(outputString); + else if (option == 2) //Streaming Output + { + using var tokenizerStream = tokenizer.CreateStream(); + using var generator = new Generator(model, generatorParams); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTop(); + Console.Write(tokenizerStream.Decode(generator.GetSequence(0)[^1])); + } + } } From 1edff307364f655f7a869d45d7d0dfe78af15090 Mon Sep 17 00:00:00 2001 From: Adam Clark Date: Sat, 16 Mar 2024 06:05:56 +1300 Subject: [PATCH 05/36] Add missing windows build step (#204) Added copy step for `onnxruntime.lib` which seems to be required for building on windows Resolves Error: `LINK : fatal error LNK1104: cannot open file 'onnxruntime.lib' [D:\Repositories\onnxruntime-genai\build\src\python\pyth on.vcxproj]` --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index be4fd5eff..769e9ab89 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ This step requires `cmake` to be installed. build.bat --config RelWithDebInfo --build_shared_lib --skip_tests --parallel [--use_cuda] copy include\onnxruntime\core\session\onnxruntime_c_api.h $ORT_HOME\include copy build\Windows\RelWithDebInfo\RelWithDebInfo\*.dll $ORT_HOME\lib + copy build\Windows\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $ORT_HOME\lib ``` On Linux From 5679d50850787ded496e362df35c71808ecf67df Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 15 Mar 2024 15:38:48 -0400 Subject: [PATCH 06/36] Cjian/preset build dir (#194) Change the CMake binaryDir consistent throughout the preset. 
This way we can simply reference it with $(ep) and $(config) --- .github/workflows/linux-cpu-arm64-build.yml | 4 +- .github/workflows/linux-cpu-x64-build.yml | 6 +-- .github/workflows/linux-gpu-x64-build.yml | 4 +- .github/workflows/mac-cpu-arm64-build.yml | 5 ++- .github/workflows/win-cpu-arm64-build.yml | 17 ++++---- .github/workflows/win-cpu-x64-build.yml | 17 ++++---- .github/workflows/win-gpu-x64-build.yml | 15 ++++--- .../stages/jobs/nuget-linux-packaging-job.yml | 2 +- .../stages/jobs/nuget-win-packaging-job.yml | 2 +- .../stages/jobs/py-win-packaging-job.yml | 4 +- .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../presets/CMakeLinuxClangConfigPresets.json | 16 ++++---- .../CMakeLinuxDefaultConfigPresets.json | 2 + cmake/presets/CMakeLinuxGccConfigPresets.json | 32 +++++++-------- cmake/presets/CMakeWinBuildPresets.json | 18 ++++----- cmake/presets/CMakeWinConfigPresets.json | 40 +++++++++++-------- src/python/CMakeLists.txt | 13 +----- src/tokenizer/CMakeLists.txt | 14 +------ test/CMakeLists.txt | 13 +----- 19 files changed, 106 insertions(+), 120 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index ec31c67ab..5018bdbb6 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -56,10 +56,10 @@ jobs: run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "ls -l /onnxruntime_src/build/gcc_cpu/release/test/" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "ls -l /onnxruntime_src/build/cpu/test/" - name: Docker -- Run tests run: | docker run --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/onnxruntime_src/build/gcc_cpu/release/test/unit_tests" + -w /onnxruntime_src ort_genai_linux_arm64_gha bash -c "/onnxruntime_src/build/cpu/test/unit_tests" diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index fe5c92ad5..da202e19c 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -39,7 +39,7 @@ jobs: - name: Install the python wheel and test dependencies run: | - python3 -m pip install build/gcc_cpu/release/wheel/onnxruntime_genai*.whl + python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl python3 -m pip install -r test/python/requirements-nightly-cpu.txt --user - name: Get HuggingFace Token @@ -59,9 +59,9 @@ jobs: if: always() continue-on-error: true run: | - ls -l ${{ github.workspace }}/build/gcc_cpu/release + ls -l ${{ github.workspace }}/build/cpu - name: Run tests run: | set -e -x - ./build/gcc_cpu/release/test/unit_tests + ./build/cpu/test/unit_tests diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 3a08b8b05..c4e4c372a 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -63,7 +63,7 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "python3 -m pip install /onnxruntime_src/build/gcc_cuda/release/wheel/onnxruntime_genai*.whl --user && python3 -m pip install -r test/python/requirements.txt --user && python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models" + -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "python3 -m pip install /onnxruntime_src/build/cuda/wheel/onnxruntime_genai*.whl --user && 
python3 -m pip install -r test/python/requirements.txt --user && python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models" - name: Docker -- Run tests run: | @@ -72,4 +72,4 @@ jobs: --gpus all \ --rm \ --volume $GITHUB_WORKSPACE:/onnxruntime_src \ - -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "/onnxruntime_src/build/gcc_cuda/release/test/unit_tests" + -w /onnxruntime_src ort_genai_linux_gpu_gha bash -c "/onnxruntime_src/build/cuda/test/unit_tests" diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 8c56512e0..d757370f4 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -33,11 +33,12 @@ jobs: run: | mv ${{ env.ort_dir }} ort - - name: Build with CMake and Clang + - name: Configure CMake run: | + cmake --preset macos_cpu_release + - name: Build with CMake run: | - cmake --preset macos_cpu_release cmake --build --preset macos_cpu_release --parallel continue-on-error: false diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 4b43c5bae..7c64ba8ff 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -14,7 +14,7 @@ env: ort_dir: "onnxruntime-win-arm64-1.17.1" ort_zip: "$(ort_dir).zip" ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)" - cmake_build_dir: 'build/release/cpu_default' + binaryDir: 'build/cpu' jobs: windows-cpu-arm64-build: @@ -45,14 +45,17 @@ jobs: run: | Rename-Item -Path $env:ort_dir -NewName ort - - name: Build with CMake + - name: Configure CMake run: | cmake --preset windows_arm64_cpu_release + + - name: Build with CMake + run: | cmake --build --preset windows_arm64_cpu_release --parallel - name: Install the Python Wheel and Test Dependencies run: | - python -m pip install (Get-ChildItem ("$env:cmake_build_dir\wheel\*.whl")) + python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) python -m pip install -r test\python\requirements.txt - name: Run the Python Tests @@ -62,15 +65,15 @@ jobs: - name: Build the C# API and Run the C# Tests run: | cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:cmake_build_dir\Release" + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() continue-on-error: true run: | - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir -Recurse - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir\test -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir\test -Recurse - name: Run tests run: | - .\build\release\cpu_default\test\Release\unit_tests.exe \ No newline at end of file + .\build\cpu\test\Release\unit_tests.exe \ No newline at end of file diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index cfe792005..f13f3c2c8 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -14,7 +14,7 @@ env: ort_dir: "onnxruntime-win-x64-1.17.1" ort_zip: "$(ort_dir).zip" ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)" - cmake_build_dir: 'build/release/cpu_default' + binaryDir: 'build/cpu' jobs: windows-cpu-x64-build: @@ -52,14 +52,17 @@ jobs: with: languages: 'cpp' - - name: Build with CMake + - name: Configure CMake run: | cmake 
--preset windows_x64_cpu_release + + - name: Build with CMake + run: | cmake --build --preset windows_x64_cpu_release --parallel - name: Install the python wheel and test dependencies run: | - python -m pip install (Get-ChildItem ("$env:cmake_build_dir\wheel\*.whl")) + python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) python -m pip install -r test\python\requirements-nightly-cpu.txt - name: Get HuggingFace Token @@ -76,18 +79,18 @@ jobs: - name: Build the C# API and Run the C# Tests run: | cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:cmake_build_dir\Release" + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() continue-on-error: true run: | - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir -Recurse - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir\test -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir\test -Recurse - name: Run tests run: | - .\build\release\cpu_default\test\Release\unit_tests.exe + .\build\cpu\test\Release\unit_tests.exe - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/win-gpu-x64-build.yml b/.github/workflows/win-gpu-x64-build.yml index 48afb21d4..a3f1d338b 100644 --- a/.github/workflows/win-gpu-x64-build.yml +++ b/.github/workflows/win-gpu-x64-build.yml @@ -14,7 +14,7 @@ env: cuda_dir: "${{ github.workspace }}\\cuda_sdk" cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 - cmake_build_dir: 'build/release/cuda_default' + binaryDir: 'build/cuda' jobs: @@ -47,9 +47,12 @@ jobs: run: | Rename-Item -Path $env:ort_dir -NewName ort - - name: Build with CMake + - name: Configure CMake run: | cmake --preset windows_x64_cuda_release -T cuda=${{ env.cuda_dir }}\\v${{ env.cuda_version }} -DTEST_PHI2=False + + - name: Build with CMake + run: | cmake --build --preset windows_x64_cuda_release --parallel - name: Add CUDA to PATH @@ -58,7 +61,7 @@ jobs: - name: Install the Python Wheel and Test Dependencies run: | - python -m pip install (Get-ChildItem ("$env:cmake_build_dir\wheel\*.whl")) + python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) python -m pip install -r test\python\requirements-nightly-cpu.txt - name: Get HuggingFace Token @@ -75,17 +78,17 @@ jobs: - name: Build the C# API and Run the C# Tests run: | cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:cmake_build_dir\Release" + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() continue-on-error: true run: | - Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:cmake_build_dir -Recurse + Get-ChildItem -Path $env:GITHUB_WORKSPACE\$env:binaryDir -Recurse - name: Prepend CUDA to PATH and Run tests run: | $env:PATH = "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin;" + $env:PATH echo "Current PATH variable is: $env:PATH" - .\build\release\cuda_default\test\Release\unit_tests.exe \ No newline at end of file + .\build\cuda\test\Release\unit_tests.exe \ No newline at end of file diff --git a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml b/.pipelines/stages/jobs/nuget-linux-packaging-job.yml index 322a48c84..fdf9d7106 100644 --- a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-linux-packaging-job.yml @@ 
-23,7 +23,7 @@ jobs: - name: ep value: ${{ parameters.ep }} - name: buildDir - value: 'build/gcc_${{ parameters.ep }}/release' + value: 'build/${{ parameters.ep }}' - name: ort_filename ${{ if eq(parameters.ep, 'cpu') }}: value: 'onnxruntime-linux-${{ parameters.arch }}-${{ parameters.ort_version }}' diff --git a/.pipelines/stages/jobs/nuget-win-packaging-job.yml b/.pipelines/stages/jobs/nuget-win-packaging-job.yml index de370fe18..b15ceb1ee 100644 --- a/.pipelines/stages/jobs/nuget-win-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-win-packaging-job.yml @@ -30,7 +30,7 @@ jobs: - name: ep value: ${{ parameters.ep }} - name: buildDir - value: 'build\release\${{ parameters.ep }}_default' + value: 'build\${{ parameters.ep }}' - name: artifactName value : 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}' - name: ort_filename diff --git a/.pipelines/stages/jobs/py-win-packaging-job.yml b/.pipelines/stages/jobs/py-win-packaging-job.yml index da6015aa9..88b285506 100644 --- a/.pipelines/stages/jobs/py-win-packaging-job.yml +++ b/.pipelines/stages/jobs/py-win-packaging-job.yml @@ -68,7 +68,7 @@ jobs: - template: steps/compliant/win-esrp-dll-step.yml parameters: - FolderPath: '$(Build.Repository.LocalPath)\build\release\$(ep)_default\wheel\onnxruntime_genai' + FolderPath: '$(Build.Repository.LocalPath)\build\$(ep)\wheel\onnxruntime_genai' DisplayName: 'ESRP - PYD Sign' DoEsrp: true Pattern: '*.pyd' @@ -83,7 +83,7 @@ jobs: - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: - SourceFolder: '$(Build.Repository.LocalPath)\build\release\$(ep)_default\wheel' + SourceFolder: '$(Build.Repository.LocalPath)\build\$(ep)\wheel' Contents: '*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 1e049ab29..897fae5d5 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -100,7 +100,7 @@ steps: - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: - SourceFolder: '$(Build.Repository.LocalPath)/build/gcc_$(ep)/release/wheel' + SourceFolder: '$(Build.Repository.LocalPath)/build/$(ep)/wheel' Contents: '*manylinux*.whl' TargetFolder: '$(Build.ArtifactStagingDirectory)' diff --git a/cmake/presets/CMakeLinuxClangConfigPresets.json b/cmake/presets/CMakeLinuxClangConfigPresets.json index 59c10d88a..ce607d2f1 100644 --- a/cmake/presets/CMakeLinuxClangConfigPresets.json +++ b/cmake/presets/CMakeLinuxClangConfigPresets.json @@ -11,7 +11,7 @@ "linux_clang_asan_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_debug_asan", @@ -20,7 +20,7 @@ "linux_clang_asan_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/debug", + "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug", "CMAKE_C_FLAGS": "-ggdb3 -O0 -fsanitize=address", @@ -34,7 +34,7 @@ "linux_clang_asan_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_minsizerel_asan", @@ -43,7 +43,7 @@ "linux_clang_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/minsizerel", + "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": {} }, { @@ -53,7 +53,7 
@@ "linux_clang_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_debug", @@ -62,7 +62,7 @@ "linux_clang_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/debug" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_relwithdebinfo", @@ -71,7 +71,7 @@ "linux_clang_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_clang_cpu_minsizerel", @@ -80,7 +80,7 @@ "linux_clang_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/clang_cpu/minsizerel", + "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": {} } ] diff --git a/cmake/presets/CMakeLinuxDefaultConfigPresets.json b/cmake/presets/CMakeLinuxDefaultConfigPresets.json index 9e904dc10..559d1dae0 100644 --- a/cmake/presets/CMakeLinuxDefaultConfigPresets.json +++ b/cmake/presets/CMakeLinuxDefaultConfigPresets.json @@ -39,6 +39,7 @@ }, { "name": "linux_gcc_asan_default", + "inherits": "linux_gcc_default", "cacheVariables": { "CMAKE_EXE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address", "CMAKE_MODULE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address", @@ -47,6 +48,7 @@ }, { "name": "linux_clang_asan_default", + "inherits": "linux_clang_default", "cacheVariables": { "CMAKE_EXE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address -L\\usr\\lib64\\x86_64-unknown-linux-gnu", "CMAKE_MODULE_LINKER_FLAGS_INIT": "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -fsanitize=address -L\\usr\\lib64\\x86_64-unknown-linux-gnu", diff --git a/cmake/presets/CMakeLinuxGccConfigPresets.json b/cmake/presets/CMakeLinuxGccConfigPresets.json index 4e7b45a4a..d5518f9ad 100644 --- a/cmake/presets/CMakeLinuxGccConfigPresets.json +++ b/cmake/presets/CMakeLinuxGccConfigPresets.json @@ -12,7 +12,7 @@ "linux_gcc_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_debug_asan", @@ -22,7 +22,7 @@ "linux_gcc_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/debug" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_relwithdebinfo_asan", @@ -32,7 +32,7 @@ "linux_gcc_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_minsizerel_asan", @@ -42,7 +42,7 @@ "linux_gcc_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/minsizerel" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_release", @@ -51,7 +51,7 @@ "linux_gcc_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/release" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_debug", @@ -60,7 +60,7 @@ "linux_gcc_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/debug" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_relwithdebinfo", @@ -69,7 +69,7 @@ "linux_gcc_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cpu_minsizerel", @@ -78,7 +78,7 
@@ "linux_gcc_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cpu/minsizerel" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "linux_gcc_cuda_release_asan", @@ -88,7 +88,7 @@ "linux_gcc_cuda_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/release" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_debug_asan", @@ -98,7 +98,7 @@ "linux_gcc_cuda_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/debug" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_relwithdebinfo_asan", @@ -108,7 +108,7 @@ "linux_gcc_cuda_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_minsizerel_asan", @@ -118,7 +118,7 @@ "linux_gcc_cuda_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/minsizerel" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_release", @@ -127,7 +127,7 @@ "linux_gcc_cuda_default", "linux_release_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/release" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_debug", @@ -136,7 +136,7 @@ "linux_gcc_cuda_default", "linux_debug_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/debug" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_relwithdebinfo", @@ -145,7 +145,7 @@ "linux_gcc_cuda_default", "linux_relwithdebinfo_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/relwithdebinfo" + "binaryDir": "${sourceDir}/build/cuda" }, { "name": "linux_gcc_cuda_minsizerel", @@ -154,7 +154,7 @@ "linux_gcc_cuda_default", "linux_minsizerel_default" ], - "binaryDir": "${sourceDir}/build/gcc_cuda/minsizerel" + "binaryDir": "${sourceDir}/build/cuda" } ] } \ No newline at end of file diff --git a/cmake/presets/CMakeWinBuildPresets.json b/cmake/presets/CMakeWinBuildPresets.json index b42eec934..1edfd4e13 100644 --- a/cmake/presets/CMakeWinBuildPresets.json +++ b/cmake/presets/CMakeWinBuildPresets.json @@ -16,12 +16,12 @@ }, { "name": "windows_x64_cpu_relwithdebinfo_asan", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cpu_relwithdebinfo_asan" }, { "name": "windows_x64_cpu_minsizerel_asan", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": "windows_x64_cpu_minsizerel_asan" }, { @@ -36,12 +36,12 @@ }, { "name": "windows_x64_cpu_relwithdebinfo", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cpu_relwithdebinfo" }, { "name": "windows_x64_cpu_minsizerel", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": "windows_x64_cpu_minsizerel" }, { @@ -56,12 +56,12 @@ }, { "name": "windows_x64_cuda_relwithdebinfo_asan", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cuda_relwithdebinfo_asan" }, { "name": "windows_x64_cuda_minsizerel_asan", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": "windows_x64_cuda_minsizerel_asan" }, { @@ -76,17 +76,17 @@ }, { "name": "windows_x64_cuda_relwithdebinfo", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_x64_cuda_relwithdebinfo" }, { "name": "windows_x64_cuda_minsizerel", - "configuration": "Minsizerel", + "configuration": "MinSizeRel", "configurePreset": 
"windows_x64_cuda_minsizerel" }, { "name": "windows_arm64_cpu_relwithdebinfo", - "configuration": "Relwithdebinfo", + "configuration": "RelWithDebInfo", "configurePreset": "windows_arm64_cpu_relwithdebinfo" }, { diff --git a/cmake/presets/CMakeWinConfigPresets.json b/cmake/presets/CMakeWinConfigPresets.json index be70ca2d6..3b22aae07 100644 --- a/cmake/presets/CMakeWinConfigPresets.json +++ b/cmake/presets/CMakeWinConfigPresets.json @@ -28,6 +28,7 @@ { "name": "windows_release_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG" } @@ -35,6 +36,7 @@ { "name": "windows_debug_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1" } @@ -42,6 +44,7 @@ { "name": "windows_relwithdebinfo_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 /DNDEBUG", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 /DNDEBUG" } @@ -49,12 +52,14 @@ { "name": "windows_minsizerel_default", "cacheVariables": { + "CMAKE_BUILD_TYPE": "MinSizeRel", "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG" } }, { "name": "windows_release_asan_default", + "inherits": "windows_release_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob2 /DNDEBUG /fsanitize=address" @@ -62,6 +67,7 @@ }, { "name": "windows_debug_asan_default", + "inherits": "windows_debug_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1 /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Ob0 /Od /RTC1 /fsanitize=address" @@ -69,6 +75,7 @@ }, { "name": "windows_relwithdebinfo_asan_default", + "inherits": "windows_relwithdebinfo_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 
/DNDEBUG /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O2 /Ob1 /DNDEBUG /fsanitize=address" @@ -76,6 +83,7 @@ }, { "name": "windows_minsizerel_asan_default", + "inherits": "windows_minsizerel_default", "cacheVariables": { "CMAKE_C_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG /fsanitize=address", "CMAKE_CXX_FLAGS": "/EHsc /Qspectre /MP /guard:cf /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /O1 /Ob1 /DNDEBUG /fsanitize=address" @@ -88,7 +96,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu release asan", - "binaryDir": "${sourceDir}/build/release/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_debug_asan", @@ -97,7 +105,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu debug asan", - "binaryDir": "${sourceDir}/build/debug/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_relwithdebinfo_asan", @@ -106,7 +114,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu relwithdebinfo asan", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_minsizerel_asan", @@ -115,7 +123,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu minsizerel asan", - "binaryDir": "${sourceDir}/build/minsizerel/cpu_asan" + "binaryDir": "${sourceDir}/build/cpu_asan" }, { "name": "windows_x64_cpu_release", @@ -124,7 +132,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu release", - "binaryDir": "${sourceDir}/build/release/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cpu_debug", @@ -133,7 +141,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu debug", - "binaryDir": "${sourceDir}/build/debug/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cpu_relwithdebinfo", @@ -142,7 +150,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu relwithdebinfo", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cpu_minsizerel", @@ -151,7 +159,7 @@ "windows_cpu_default" ], "displayName": "windows x64 cpu minsizerel", - "binaryDir": "${sourceDir}/build/minsizerel/cpu_default" + "binaryDir": "${sourceDir}/build/cpu" }, { "name": "windows_x64_cuda_release_asan", @@ -160,7 +168,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda release asan", - "binaryDir": "${sourceDir}/build/release/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { "USE_CUDA": "ON" } @@ -172,7 +180,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda debug asan", - "binaryDir": "${sourceDir}/build/debug/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { "USE_CUDA": "ON" } @@ -184,7 +192,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda relwithdebinfo asan", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { "USE_CUDA": "ON" } @@ -196,7 +204,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda minsizerel asan", - "binaryDir": "${sourceDir}/build/minsizerel/cuda_asan", + "binaryDir": "${sourceDir}/build/cuda_asan", "cacheVariables": { 
"USE_CUDA": "ON" } @@ -208,7 +216,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda release", - "binaryDir": "${sourceDir}/build/release/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } @@ -220,7 +228,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda debug", - "binaryDir": "${sourceDir}/build/debug/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } @@ -232,7 +240,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda relwithdebinfo", - "binaryDir": "${sourceDir}/build/relwithdebinfo/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } @@ -244,7 +252,7 @@ "windows_cuda_default" ], "displayName": "windows x64 cuda minsizerel", - "binaryDir": "${sourceDir}/build/minsizerel/cuda_default", + "binaryDir": "${sourceDir}/build/cuda", "cacheVariables": { "USE_CUDA": "ON" } diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 6bce4cbd3..942664246 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -1,15 +1,4 @@ -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10) - add_compile_definitions(USE_CXX17=1) - message("Python is using C++17 because GCC Version is less than 10") - set(CMAKE_CXX_STANDARD 17) -elseif(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - add_compile_definitions(USE_CXX17=1) - message("Python is using C++17 Because CUDA Version is less than 12") - set(CMAKE_CXX_STANDARD 17) -else() - message("Python is using C++20") - set(CMAKE_CXX_STANDARD 20) -endif() +include(${CMAKE_SOURCE_DIR}/cmake/cxx_standard.cmake) file(GLOB python_srcs CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.h" diff --git a/src/tokenizer/CMakeLists.txt b/src/tokenizer/CMakeLists.txt index 1eb0fc6c6..69d603715 100644 --- a/src/tokenizer/CMakeLists.txt +++ b/src/tokenizer/CMakeLists.txt @@ -1,17 +1,5 @@ set(TOKENIZER_ROOT ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) - -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10) - add_compile_definitions(USE_CXX17=1) - message("Tokenizer is using C++17 because GCC Version is less than 10") - set(CMAKE_CXX_STANDARD 17) -elseif(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - add_compile_definitions(USE_CXX17=1) - message("Tokenizer is using C++17 Because CUDA Version is less than 12") - set(CMAKE_CXX_STANDARD 17) -else() - message("Tokenizer is using C++20") - set(CMAKE_CXX_STANDARD 20) -endif() +include(${CMAKE_SOURCE_DIR}/cmake/cxx_standard.cmake) file(GLOB tokenizer_srcs CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cc" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a4263eeda..83571118e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,17 +1,6 @@ enable_testing() -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10) - add_compile_definitions(USE_CXX17=1) - message("Test is using C++17 because GCC Version is less than 10") - set(CMAKE_CXX_STANDARD 17) -elseif(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) - add_compile_definitions(USE_CXX17=1) - message("Test is using C++17 Because CUDA Version is less than 12") - set(CMAKE_CXX_STANDARD 17) -else() - message("Test is using C++20") - set(CMAKE_CXX_STANDARD 20) -endif() +include(${CMAKE_SOURCE_DIR}/cmake/cxx_standard.cmake) set(TESTS_ROOT ${CMAKE_CURRENT_SOURCE_DIR} 
PARENT_SCOPE) file(GLOB test_srcs CONFIGURE_DEPENDS From 7cc8062b8a17101f4bdd9367ca732c690309b6c1 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 15 Mar 2024 14:10:47 -0700 Subject: [PATCH 07/36] Update README.md (#201) --- README.md | 96 +++++++------------------------------------------------ 1 file changed, 11 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index 769e9ab89..564ef3370 100644 --- a/README.md +++ b/README.md @@ -31,25 +31,27 @@ Users can call a high level `generate()` method, or run each iteration of the mo ## Coming very soon -* Support for the Whisper model architectures * Support for DirectML +* Support for the encoder decoder model architectures, such as whisper, T5 and BART. + +## Coming soon + +* Support for mobile devices (Android and iOS) with Java and Objective-C bindings ## Roadmap +* Stable diffusion pipeline * Automatic model download and cache * More model architectures ## Sample code for phi-2 in Python -Install onnxruntime-genai. - -(Temporary) Build and install from source according to the instructions below. - +[Install](https://onnxruntime.ai/docs/genai/install) the onnxruntime-genai Python package. ```python import onnxruntime_genai as og -model = og.Model(f'models/microsoft/phi-2', device_type) +model = og.Model(f'models/microsoft/phi-2') tokenizer = og.Tokenizer(model) @@ -72,88 +74,11 @@ print("Output:") print(text) ``` - -## Build from source - -This step requires `cmake` to be installed. - -1. Clone this repo - - ```bash - git clone https://github.com/microsoft/onnxruntime-genai - cd onnxruntime-genai - ``` - -2. Install ONNX Runtime - - By default, the onnxruntime-genai build expects to find the ONNX Runtime include and binaries in a folder called `ort` in the root directory of onnxruntime-genai. You can put the ONNX Runtime files in a different location and specify this location to the onnxruntime-genai build. These instructions use ORT_HOME as the location. - - * Install from release - - These instructions are for the Linux GPU build of ONNX Runtime. Replace the location with the operating system and target of choice. - - ```bash - cd $ORT_HOME - wget https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-gpu-1.17.1.tgz - tar xvzf onnxruntime-linux-x64-gpu-1.17.1.tgz - mv onnxruntime-linux-x64-gpu-1.17.1/include . - mv onnxruntime-linux-x64-gpu-1.17.1/lib . - ``` - - * Or build from source - - ``` - git clone https://github.com/microsoft/onnxruntime.git - cd onnxruntime - ``` - - Create include and lib folders in the ORT_HOME directory - - ```bash - mkdir $ORT_HOME/include - mkdir $ORT_HOME/lib - ``` - - Build from source and copy the include and libraries into ORT_HOME - - On Windows - - ```cmd - build.bat --config RelWithDebInfo --build_shared_lib --skip_tests --parallel [--use_cuda] - copy include\onnxruntime\core\session\onnxruntime_c_api.h $ORT_HOME\include - copy build\Windows\RelWithDebInfo\RelWithDebInfo\*.dll $ORT_HOME\lib - copy build\Windows\RelWithDebInfo\RelWithDebInfo\onnxruntime.lib $ORT_HOME\lib - ``` - - On Linux - - ```cmd - ./build.sh --build_shared_lib --skip_tests --parallel [--use_cuda] - cp include/onnxruntime/core/session/onnxruntime_c_api.h $ORT_HOME/include - cp build/Linux/RelWithDebInfo/libonnxruntime*.so* $ORT_HOME/lib - ``` - -3. Build onnxruntime-genai - - If you are building for CUDA, add the cuda_home argument. - - ```bash - cd .. - python build.py [--cuda_home ] - ``` - -4. 
Install Python wheel - - ```bash - cd build/wheel - pip install *.whl - ``` - ## Model download and export ONNX models are run from a local folder, via a string supplied to the `Model()` method. -To source `microsoft/phi-2` optimized for your target, download and run the following script. You will need to be logged into Hugging Face via the CLI to run the script. +You can bring your own ONNX model or use the model builder utility, included in this package. Install model builder dependencies. @@ -165,14 +90,15 @@ pip install onnx pip install onnxruntime ``` - Export int4 CPU version ```bash huggingface-cli login --token python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o ``` +## Known issues +* Mistral and Gemma support on CUDA only ## Contributing From 9760fbd801a139dd068ff3f88f5ce127f485488f Mon Sep 17 00:00:00 2001 From: aciddelgado <139922440+aciddelgado@users.noreply.github.com> Date: Fri, 15 Mar 2024 23:45:47 -0700 Subject: [PATCH 08/36] swap p and k in sample function (#162) swap p and k to match generate api functions --- src/generators.cpp | 2 +- src/search.cpp | 2 +- src/search.h | 4 ++-- src/search_cuda.cpp | 2 +- src/search_cuda.h | 2 +- test/sampling_benchmark.cpp | 4 ++-- test/sampling_tests.cpp | 39 ++++++++++++++++++++++++------------- 7 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index 640b6559e..b5cd1bb67 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -120,7 +120,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper throw std::runtime_error("top_k must be 0 or greater"); if (top_p > 0.0f && top_k > 1) { - search_->SampleTopPAndK(top_p, top_k, temperature); + search_->SampleTopKTopP(top_k, top_p, temperature); } else if (top_k > 1) { search_->SampleTopK(top_k, temperature); } else { diff --git a/src/search.cpp b/src/search.cpp index b7dd21e46..cc3a6fd10 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -187,7 +187,7 @@ void GreedySearch_Cpu::SampleTopP(float p, float temperature) { AppendNextTokensToSequences(); } -void GreedySearch_Cpu::SampleTopPAndK(float p, int k, float temperature) { +void GreedySearch_Cpu::SampleTopKTopP(int k, float p, float temperature) { std::uniform_real_distribution dis(0, p); for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { diff --git a/src/search.h b/src/search.h index abf373b51..bc81313eb 100644 --- a/src/search.h +++ b/src/search.h @@ -24,7 +24,7 @@ struct Search { virtual void SelectTop() = 0; virtual void SampleTopP(float /*p*/, float /*temperature*/) { assert(false); } virtual void SampleTopK(int /*k*/, float /*temperature*/) { assert(false); } - virtual void SampleTopPAndK(float /*p*/, int /*k*/, float /*temperature*/) { assert(false); } + virtual void SampleTopKTopP(int /*k*/, float /*p*/, float /*temperature*/) { assert(false); } // Scoring features virtual void ApplyMinLength(int min_length) = 0; @@ -69,7 +69,7 @@ struct GreedySearch_Cpu : Search_Cpu { void SelectTop() override; void SampleTopK(int k, float temperature) override; void SampleTopP(float p, float temperature) override; - void SampleTopPAndK(float /*p*/, int /*k*/, float /*temperature*/) override; + void SampleTopKTopP(int /*k*/, float /*p*/, float /*temperature*/) override; private: bool PadIfAlreadyEOS(size_t batch_id); diff --git a/src/search_cuda.cpp b/src/search_cuda.cpp index bc285cad0..304f62cc2 100644 --- a/src/search_cuda.cpp +++ b/src/search_cuda.cpp @@ -154,7 +154,7 @@ void 
GreedySearch_Cuda::SampleTopK(int k, float temperature) { AppendNextTokensToSequences(); } -void GreedySearch_Cuda::SampleTopPAndK(float p, int k, float temperature) { +void GreedySearch_Cuda::SampleTopKTopP(int k, float p, float temperature) { std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), params_.batch_size, k, p, temperature); diff --git a/src/search_cuda.h b/src/search_cuda.h index 50628b9f3..8d1ddbbb4 100644 --- a/src/search_cuda.h +++ b/src/search_cuda.h @@ -51,7 +51,7 @@ struct GreedySearch_Cuda : Search_Cuda { void SelectTop() override; void SampleTopK(int k, float t) override; void SampleTopP(float p, float t) override; - void SampleTopPAndK(float p, int k, float t) override; + void SampleTopKTopP(int k, float p, float t) override; private: void CheckForEOS(); diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index 0fcb138e6..3f21ed669 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -117,7 +117,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->search_->SampleTopKTopP(k, p, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); @@ -252,7 +252,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { cudaStreamSynchronize(params.cuda_stream); auto start = std::chrono::high_resolution_clock::now(); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->search_->SampleTopKTopP(k, p, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(stop - start); total_time += duration.count(); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 96264f015..f42c03e0a 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -35,8 +35,9 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { auto generator = Generators::CreateGenerator(*model, params); auto logits_span = Generators::cpu_span(logits_cpu); generator->search_->SetLogits(logits_span); + generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->search_->SampleTopP(0.25f, 1.0f); + generator->GenerateNextToken_TopP(0.25f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -60,10 +61,11 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { auto generator = Generators::CreateGenerator(*model, params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); + generator->computed_logits_ = true; // Verify outputs match expected outputs int k = 2; - generator->search_->SampleTopK(k, 1.0); + generator->GenerateNextToken_TopK(k, 1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -91,10 +93,11 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { auto generator = Generators::CreateGenerator(*model, params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); + generator->computed_logits_ = true; // Verify 
outputs match expected outputs float p = 0.25f; int k = 2; - generator->search_->SampleTopPAndK(p, k, 1.0); + generator->GenerateNextToken_TopK_TopP(k, p, 1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -143,7 +146,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); - generator->search_->SampleTopP(0.95f, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopP(0.95f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -178,7 +182,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy=logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); - generator->search_->SampleTopK(k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK(k, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -214,7 +219,8 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -251,8 +257,9 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { cudaStreamSynchronize(params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); + generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->search_->SampleTopP(0.25f, 1.0f); + generator->GenerateNextToken_TopP(0.25f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -278,9 +285,10 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { cudaStreamSynchronize(params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); + generator->computed_logits_ = true; // Verify outputs match expected outputs int k = 2; - generator->search_->SampleTopK(k, 1.0); + generator->GenerateNextToken_TopK(k, 1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -310,10 +318,11 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { cudaStreamSynchronize(params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); + generator->computed_logits_ = true; // Verify outputs match expected outputs float p = 0.25f; int k = 2; - generator->search_->SampleTopPAndK(p, k, 1.0); + generator->GenerateNextToken_TopK_TopP(k, p, 
1.0); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -348,7 +357,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SampleTopP(0.95f, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopP(0.95f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs @@ -387,7 +397,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SampleTopK(k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK(k, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs @@ -427,7 +438,8 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SampleTopPAndK(p, k, 1.0f); + generator->computed_logits_ = true; + generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs @@ -465,7 +477,8 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); auto generator = Generators::CreateGenerator(*model, params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - generator->search_->SelectTop(); + generator->computed_logits_ = true; + generator->GenerateNextToken_Top(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params.cuda_stream); // Verify outputs match expected outputs From 21fd88ec351760a63b813ba8b31e2c9c1248131a Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Sun, 17 Mar 2024 19:45:34 -0700 Subject: [PATCH 09/36] Update README.md (#208) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 564ef3370..f49534a6d 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ Users can call a high level `generate()` method, or run each iteration of the mo * Built in logits processing like repetition penalties * Easy custom scoring +See full documentation at [https://onnxruntime.ai/docs/genai]. 
+ ## Features * Supported model architectures: From 13d8be5ea28c77ba91860b0b008b9e0a532bf2cd Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Mon, 18 Mar 2024 10:58:46 -0700 Subject: [PATCH 10/36] Fix install link in READMEs (#213) --- README.md | 2 +- examples/csharp/HelloPhi2.csproj | 2 +- examples/csharp/README.md | 4 ++-- examples/python/README.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f49534a6d..350a2fe67 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ See full documentation at [https://onnxruntime.ai/docs/genai]. ## Sample code for phi-2 in Python -[Install](https://onnxruntime.ai/docs/genai/install) the onnxruntime-genai Python package. +[Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package. ```python import onnxruntime_genai as og diff --git a/examples/csharp/HelloPhi2.csproj b/examples/csharp/HelloPhi2.csproj index 0fb2a1948..a431aa126 100644 --- a/examples/csharp/HelloPhi2.csproj +++ b/examples/csharp/HelloPhi2.csproj @@ -9,7 +9,7 @@ - + diff --git a/examples/csharp/README.md b/examples/csharp/README.md index cb6675e74..edb71a717 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -8,7 +8,7 @@ To generate the model with model builder: 1. Install the python package - Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). + Install the Python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). 2. Install the model builder script dependencies @@ -33,6 +33,6 @@ If you bring your own model, you need to provide the configuration. See the [con ## Run the phi-2 model -Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). +Install the OnnxRuntime.GenAI nuget according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). Open [HelloPhi2.sln](HelloPhi2.sln) and run the console application. diff --git a/examples/python/README.md b/examples/python/README.md index 2f20cfcd0..cf7fe3450 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -2,7 +2,7 @@ ## Install the onnxruntime-genai library -Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/install). +Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). 
## Get the model From e6c99669ee0cbde6afd525b9b03dfdc70269c2b3 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:40:17 -0700 Subject: [PATCH 11/36] Simple sequence_length check against max_length (#211) A much better error for when the prompt is greater length than max_length --- src/generators.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/generators.cpp b/src/generators.cpp index b5cd1bb67..d7bf47abe 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -75,6 +75,8 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ throw std::runtime_error("batch_size must be 1 or greater"); if (params.vocab_size < 1) throw std::runtime_error("vocab_size must be 1 or greater"); + if (params.sequence_length >= params.search.max_length) + throw std::runtime_error("input sequence_length is >= max_length"); search_ = CreateSearch(params); state_ = model.CreateState(search_->GetSequenceLengths(), params); From 970fb4d5eb19572f6de9394d898874ed64876d9a Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 19 Mar 2024 14:38:54 -0700 Subject: [PATCH 12/36] Check the return value when calling SetCurrentGpuDeviceId (#219) --- src/models/onnxruntime_inline.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/models/onnxruntime_inline.h b/src/models/onnxruntime_inline.h index 8c955f08b..cd7180e93 100644 --- a/src/models/onnxruntime_inline.h +++ b/src/models/onnxruntime_inline.h @@ -151,14 +151,7 @@ inline std::unique_ptr Allocator::Create(const OrtSession& sess, cons } inline void SetCurrentGpuDeviceId(int device_id) { -#ifdef __APPLE__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-result" -#endif - api->SetCurrentGpuDeviceId(device_id); -#ifdef __APPLE__ -#pragma clang diagnostic pop -#endif + ThrowOnError(api->SetCurrentGpuDeviceId(device_id)); } inline int GetCurrentGpuDeviceId() { From 8679a684e0b0b01200f9462fd9d763f278a1d4aa Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 19 Mar 2024 14:39:26 -0700 Subject: [PATCH 13/36] Fix python scripts to use options properly (#217) We were using set_search_options, then for example calling 'get_next_token_topk_topp' which is redundant. We can call 'get_next_token()' and it will use the search options we just set. 
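For illustration, a minimal sketch of the intended pattern in Python, based on the calls shown in the diffs below (the model path and the search-option values here are placeholders):

```python
import onnxruntime_genai as og

model = og.Model("path/to/model")   # placeholder path
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
# Set the sampling behaviour once via search options...
params.set_search_options({"do_sample": True, "top_k": 50, "top_p": 0.9,
                           "temperature": 0.6, "max_length": 40})
params.input_ids = tokenizer.encode("This is a test.")

generator = og.Generator(model, params)
while not generator.is_done():
    generator.compute_logits()
    # ...then generate_next_token() picks up top_k/top_p/temperature from those options
    generator.generate_next_token()

print(tokenizer.decode(generator.get_sequence(0)))
```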
Also now bypasses calling TopK_TopP if P is 1.0 when calling generate_next_token --- benchmark/python/benchmark_e2e.py | 21 ++++++++++----------- examples/python/model-chat.py | 4 ++-- src/generators.cpp | 2 +- test/python/test_onnxruntime_genai_api.py | 4 ++-- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/benchmark/python/benchmark_e2e.py b/benchmark/python/benchmark_e2e.py index 4dddded46..f5cfd3143 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -18,12 +18,12 @@ def generate_prompt(model, tokenizer, prompt_length) -> str: prompt = "a" tokens = tokenizer.encode(prompt) params=og.GeneratorParams(model) - params.set_search_options({"max_length":prompt_length, "min_length":prompt_length+1}) + params.set_search_options({"do_sample":True, "top_k":5, "temperature":temperature, "max_length":prompt_length, "min_length":prompt_length+1}) params.input_ids = tokens generator=og.Generator(model, params) while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top_k(5, temperature) + generator.generate_next_token() return tokenizer.decode(generator.get_sequence(0)) def save_results(results, filename): @@ -65,15 +65,17 @@ def main(args): # Generate prompt prompt = [generate_prompt(model, tokenizer, prompt_length)] * batch_size tokens = tokenizer.encode_batch(prompt) + + params = og.GeneratorParams(model) + params.input_ids = tokens + params.set_search_options({"do_sample":True, "top_k":args.top_k, "top_p":args.top_p, "temperature":temperature, "max_length":max_length, "min_length":max_length}) + if args.verbose: print("Running warmup runs...") for _ in tqdm(range(args.warmup)): - params = og.GeneratorParams(model) - params.input_ids = tokens - params.set_search_options({"max_length":max_length, "min_length":max_length}) generator = og.Generator(model, params) while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, temperature) + generator.generate_next_token() if args.print_model_output: print(tokenizer.decode(generator.get_sequence(0))) tokenize_times = [] @@ -84,9 +86,6 @@ def main(args): if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}") for _ in tqdm(range(num_repetitions)): # Prepare run - params = og.GeneratorParams(model) - params.input_ids = tokens - params.set_search_options({"max_length":max_length, "min_length":max_length}) generator = og.Generator(model, params) # Measure tokenization @@ -102,7 +101,7 @@ def main(args): prompt_times.append(prompt_end_time - prompt_start_time) sampling_start_time = time.perf_counter() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, temperature) + generator.generate_next_token() sampling_end_time = time.perf_counter() sampling_times.append(sampling_end_time - sampling_start_time) @@ -115,7 +114,7 @@ def main(args): token_gen_end_time = time.perf_counter() sampling_start_time = time.perf_counter() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, temperature) + generator.generate_next_token() sampling_end_time = time.perf_counter() token_gen_times.append(token_gen_end_time - token_gen_start_time) diff --git a/examples/python/model-chat.py b/examples/python/model-chat.py index edbfc1fc0..559423989 100644 --- a/examples/python/model-chat.py +++ b/examples/python/model-chat.py @@ -20,7 +20,7 @@ def main(args): input_tokens = tokenizer.encode(text) params = og.GeneratorParams(model) - 
params.set_search_options({"max_length": args.max_length, "top_p": args.top_p, "top_k": args.top_k, "temperature": args.temperature, "repetition_penalty": args.repetition_penalty}) + params.set_search_options({"do_sample": True, "max_length": args.max_length, "top_p": args.top_p, "top_k": args.top_k, "temperature": args.temperature, "repetition_penalty": args.repetition_penalty}) params.input_ids = input_tokens generator = og.Generator(model, params) if args.verbose: print("Generator created") @@ -29,7 +29,7 @@ def main(args): print(f'\n{text}', end='') while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top_k_top_p(args.top_k, args.top_p, args.temperature) + generator.generate_next_token() print(tokenizer_stream.decode(generator.get_next_tokens()[0]), end='', flush=True) print() diff --git a/src/generators.cpp b/src/generators.cpp index d7bf47abe..d64ad34a3 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -121,7 +121,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper if (top_k < 0) throw std::runtime_error("top_k must be 0 or greater"); - if (top_p > 0.0f && top_k > 1) { + if (top_p > 0.0f && top_p < 1.0f && top_k > 1) { search_->SampleTopKTopP(top_k, top_p, temperature); } else if (top_k > 1) { search_->SampleTopK(top_k, temperature); diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py index 7be2c68aa..695fb8802 100644 --- a/test/python/test_onnxruntime_genai_api.py +++ b/test/python/test_onnxruntime_genai_api.py @@ -32,14 +32,14 @@ def test_greedy_search(test_data_path, relative_model_path): search_params.input_ids = np.array( [[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32 ) - search_params.set_search_options({"max_length": 10}) + search_params.set_search_options({"do_sample": False, "max_length": 10}) input_ids_shape = [2, 4] batch_size = input_ids_shape[0] generator = og.Generator(model, search_params) while not generator.is_done(): generator.compute_logits() - generator.generate_next_token_top() + generator.generate_next_token() expected_sequence = np.array( [ From 7a33c01468d2cf03d89e037ed6cca7a6e9ca061d Mon Sep 17 00:00:00 2001 From: Adam Clark Date: Wed, 20 Mar 2024 11:10:36 +1300 Subject: [PATCH 14/36] Access to sampling methods in C# Api (#206) --- src/csharp/Generator.cs | 21 ++- src/csharp/NativeMethods.cs | 23 ++++ src/ort_genai_c.cpp | 28 ++++ src/ort_genai_c.h | 8 ++ test/c_api_tests.cpp | 179 +++++++++++++++++++++++++ test/csharp/TestOnnxRuntimeGenAIAPI.cs | 156 +++++++++++++++++++++ 6 files changed, 414 insertions(+), 1 deletion(-) diff --git a/src/csharp/Generator.cs b/src/csharp/Generator.cs index 1dc81883b..10c3d4e47 100644 --- a/src/csharp/Generator.cs +++ b/src/csharp/Generator.cs @@ -2,7 +2,6 @@ // Licensed under the MIT License. 
using System; -using System.Runtime.InteropServices; namespace Microsoft.ML.OnnxRuntimeGenAI { @@ -26,11 +25,31 @@ public void ComputeLogits() Result.VerifySuccess(NativeMethods.OgaGenerator_ComputeLogits(_generatorHandle)); } + public void GenerateNextToken() + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken(_generatorHandle)); + } + public void GenerateNextTokenTop() { Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_Top(_generatorHandle)); } + public void GenerateNextTokenTopK(int k, float temperature) + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK(_generatorHandle, k, temperature)); + } + + public void GenerateNextTokenTopP(float p, float temperature) + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopP(_generatorHandle, p, temperature)); + } + + public void GenerateNextTokenTopKTopP(int k, float p, float temperature) + { + Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK_TopP(_generatorHandle, k, p, temperature)); + } + public ReadOnlySpan GetSequence(ulong index) { ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceLength(_generatorHandle, (UIntPtr)index).ToUInt64(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 4b41102d7..039dfb4de 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -76,10 +76,33 @@ internal class NativeLib [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGenerator_ComputeLogits(IntPtr /* OgaGenerator* */ generator); + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken(IntPtr /* OgaGenerator* */ generator); + // This function is used to generate the next token in the sequence using the greedy search algorithm. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_Top(IntPtr /* OgaGenerator* */ generator); + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK(IntPtr /* OgaGenerator* */ generator, + int /* int32_t */ k, + float /* single_t */ t); + + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopP(IntPtr /* OgaGenerator* */ generator, + float /* single_t */ p, + float /* single_t */ t); + + // This function is used to generate the next token in the sequence using the greedy search algorithm. + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK_TopP(IntPtr /* OgaGenerator* */ generator, + int /* int32_t */ k, + float /* single_t */ p, + float /* single_t */ t); + // This function returns the length of the sequence at the given index. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceLength(IntPtr /* const OgaGenerator* */ generator, diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 25b0edf18..fbb986af0 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -145,6 +145,13 @@ OgaResult* OGA_API_CALL OgaGenerator_ComputeLogits(OgaGenerator* generator) { OGA_CATCH } +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken(); + return nullptr; + OGA_CATCH +} + OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator) { OGA_TRY reinterpret_cast(generator)->GenerateNextToken_Top(); @@ -152,6 +159,27 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generat OGA_CATCH } +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken_TopK(k, t); + return nullptr; + OGA_CATCH +} + +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken_TopP(p, t); + return nullptr; + OGA_CATCH +} + +OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t) { + OGA_TRY + reinterpret_cast(generator)->GenerateNextToken_TopK_TopP(k, p, t); + return nullptr; + OGA_CATCH +} + size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().size(); diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 255bfbafb..e702082fc 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -180,9 +180,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_ComputeLogits(OgaGenerator* gene */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator); +/* Top-K sampling: most probable words from the model's output probability distribution for the next word + */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t); + +/*Top-P sampling selects words from the smallest set of words whose cumulative probability exceeds a predefined threshold (p) + */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator); + /* * \brief Returns the number of tokens in the sequence at the given index. * \param[in] generator The generator to get the count of the tokens for the sequence at the given index. 
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index ab5bfc169..2ac6bfb71 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -221,3 +221,182 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); } } + +#if TEST_PHI2 +TEST(CAPITests, TopKCAPI) { + float top_k = 50; + float temp = 0.6f; + + OgaModel* model; + CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); + OgaModelPtr model_ptr{model}; + + OgaTokenizer* tokenizer; + CheckResult(OgaCreateTokenizer(model, &tokenizer)); + OgaTokenizerPtr tokenizer_ptr{tokenizer}; + + OgaSequences* input_sequences; + CheckResult(OgaCreateSequences(&input_sequences)); + OgaSequencesPtr sequences_ptr{input_sequences}; + + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; + + for (auto& string : input_strings) + CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + + OgaGeneratorParams* params; + CheckResult(OgaCreateGeneratorParams(model, ¶ms)); + OgaGeneratorParamsPtr params_ptr{params}; + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + + OgaGenerator* generator; + CheckResult(OgaCreateGenerator(model, params, &generator)); + OgaGeneratorPtr generator_ptr{generator}; + + while (!OgaGenerator_IsDone(generator)) { + CheckResult(OgaGenerator_ComputeLogits(generator)); + CheckResult(OgaGenerator_GenerateNextToken_TopK(generator, top_k, temp)); + } + + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + OgaSequences* output_sequences; + CheckResult(OgaGenerate(model, params, &output_sequences)); + OgaSequencesPtr output_sequences_ptr{output_sequences}; + + // Decode The Batch + for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { + std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + + const char* out_string; + CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + std::cout << "Decoded string:" << out_string << std::endl; + OgaDestroyString(out_string); + } +} + +TEST(CAPITests, TopPCAPI) { + float top_p = 0.6f; + float temp = 0.6f; + + OgaModel* model; + CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); + OgaModelPtr model_ptr{model}; + + OgaTokenizer* tokenizer; + CheckResult(OgaCreateTokenizer(model, &tokenizer)); + OgaTokenizerPtr tokenizer_ptr{tokenizer}; + + OgaSequences* input_sequences; + CheckResult(OgaCreateSequences(&input_sequences)); + OgaSequencesPtr sequences_ptr{input_sequences}; + + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; + + for (auto& string : input_strings) + CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + + OgaGeneratorParams* params; + CheckResult(OgaCreateGeneratorParams(model, ¶ms)); + OgaGeneratorParamsPtr params_ptr{params}; + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + + OgaGenerator* generator; + CheckResult(OgaCreateGenerator(model, params, &generator)); + OgaGeneratorPtr 
generator_ptr{generator}; + + while (!OgaGenerator_IsDone(generator)) { + CheckResult(OgaGenerator_ComputeLogits(generator)); + CheckResult(OgaGenerator_GenerateNextToken_TopP(generator, top_p, temp)); + } + + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + OgaSequences* output_sequences; + CheckResult(OgaGenerate(model, params, &output_sequences)); + OgaSequencesPtr output_sequences_ptr{output_sequences}; + + // Decode The Batch + for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { + std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + + const char* out_string; + CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + std::cout << "Decoded string:" << out_string << std::endl; + OgaDestroyString(out_string); + } +} + +TEST(CAPITests, TopKTopPCAPI) { + float top_p = 0.6f; + int top_k = 50; + float temp = 0.6f; + + OgaModel* model; + CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); + OgaModelPtr model_ptr{model}; + + OgaTokenizer* tokenizer; + CheckResult(OgaCreateTokenizer(model, &tokenizer)); + OgaTokenizerPtr tokenizer_ptr{tokenizer}; + + OgaSequences* input_sequences; + CheckResult(OgaCreateSequences(&input_sequences)); + OgaSequencesPtr sequences_ptr{input_sequences}; + + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; + + for (auto& string : input_strings) + CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + + OgaGeneratorParams* params; + CheckResult(OgaCreateGeneratorParams(model, ¶ms)); + OgaGeneratorParamsPtr params_ptr{params}; + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + + OgaGenerator* generator; + CheckResult(OgaCreateGenerator(model, params, &generator)); + OgaGeneratorPtr generator_ptr{generator}; + + while (!OgaGenerator_IsDone(generator)) { + CheckResult(OgaGenerator_ComputeLogits(generator)); + CheckResult(OgaGenerator_GenerateNextToken_TopK_TopP(generator, top_k, top_p, temp)); + } + + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); + CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + OgaSequences* output_sequences; + CheckResult(OgaGenerate(model, params, &output_sequences)); + OgaSequencesPtr output_sequences_ptr{output_sequences}; + + // Decode The Batch + for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { + std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + + const char* out_string; + CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + std::cout << "Decoded string:" << out_string << std::endl; + OgaDestroyString(out_string); + } +} + +#endif // TEST_PHI2 diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 2121fc7aa..7bca5ffdc 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -8,6 +8,7 @@ using Microsoft.ML.OnnxRuntimeGenAI; using 
System.Collections.Generic; using System.Linq; +using System.Reflection.Emit; namespace Microsoft.ML.OnnxRuntimeGenAI.Tests { @@ -86,6 +87,161 @@ public void TestGreedySearch() } } + [IgnoreOnModelAbsebceFact(DisplayName = "TestTopKSearch")] + public void TestTopKSearch() + { + int topK = 100; + float temp = 0.6f; + ulong maxLength = 40; + + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); + using (var model = new Model(modelPath)) + { + Assert.NotNull(model); + using (var tokenizer = new Tokenizer(model)) + { + Assert.NotNull(tokenizer); + + var strings = new string[] { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog." + }; + + var sequences = tokenizer.EncodeBatch(strings); + Assert.NotNull(sequences); + Assert.Equal((ulong)strings.Length, sequences.NumSequences); + + using GeneratorParams generatorParams = new GeneratorParams(model); + Assert.NotNull(generatorParams); + + generatorParams.SetSearchOption("max_length", 20); + generatorParams.SetInputSequences(sequences); + + using Generator generator = new Generator(model, generatorParams); + Assert.NotNull(generator); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTopK(topK, temp); + } + + generatorParams.SetSearchOption("do_sample", true); + generatorParams.SetSearchOption("top_k", topK); + generatorParams.SetSearchOption("temperature", temp); + var outputSequences = model.Generate(generatorParams); + Assert.NotNull(outputSequences); + + var outputStrings = tokenizer.DecodeBatch(outputSequences); + Assert.NotNull(outputStrings); + } + } + } + + [IgnoreOnModelAbsebceFact(DisplayName = "TestTopPSearch")] + public void TestTopPSearch() + { + float topP = 0.6f; + float temp = 0.6f; + ulong maxLength = 40; + + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); + using (var model = new Model(modelPath)) + { + Assert.NotNull(model); + using (var tokenizer = new Tokenizer(model)) + { + Assert.NotNull(tokenizer); + + var strings = new string[] { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog." 
+ }; + + var sequences = tokenizer.EncodeBatch(strings); + Assert.NotNull(sequences); + Assert.Equal((ulong)strings.Length, sequences.NumSequences); + + using GeneratorParams generatorParams = new GeneratorParams(model); + Assert.NotNull(generatorParams); + + generatorParams.SetSearchOption("max_length", 20); + generatorParams.SetInputSequences(sequences); + + using Generator generator = new Generator(model, generatorParams); + Assert.NotNull(generator); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTopP(topP, temp); + } + + generatorParams.SetSearchOption("do_sample", true); + generatorParams.SetSearchOption("top_p", topP); + generatorParams.SetSearchOption("temperature", temp); + var outputSequences = model.Generate(generatorParams); + Assert.NotNull(outputSequences); + + var outputStrings = tokenizer.DecodeBatch(outputSequences); + Assert.NotNull(outputStrings); + } + } + } + + [IgnoreOnModelAbsebceFact(DisplayName = "TestTopKTopPSearch")] + public void TestTopKTopPSearch() + { + int topK = 100; + float topP = 0.6f; + float temp = 0.6f; + ulong maxLength = 40; + + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); + using (var model = new Model(modelPath)) + { + Assert.NotNull(model); + using (var tokenizer = new Tokenizer(model)) + { + Assert.NotNull(tokenizer); + + var strings = new string[] { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog." + }; + + var sequences = tokenizer.EncodeBatch(strings); + Assert.NotNull(sequences); + Assert.Equal((ulong)strings.Length, sequences.NumSequences); + + using GeneratorParams generatorParams = new GeneratorParams(model); + Assert.NotNull(generatorParams); + + generatorParams.SetSearchOption("max_length", 20); + generatorParams.SetInputSequences(sequences); + + using Generator generator = new Generator(model, generatorParams); + Assert.NotNull(generator); + while (!generator.IsDone()) + { + generator.ComputeLogits(); + generator.GenerateNextTokenTopKTopP(topK, topP, temp); + } + + generatorParams.SetSearchOption("do_sample", true); + generatorParams.SetSearchOption("top_k", topK); + generatorParams.SetSearchOption("top_p", topP); + generatorParams.SetSearchOption("temperature", temp); + var outputSequences = model.Generate(generatorParams); + Assert.NotNull(outputSequences); + + var outputStrings = tokenizer.DecodeBatch(outputSequences); + Assert.NotNull(outputStrings); + } + } + } + [IgnoreOnModelAbsebceFact(DisplayName = "TestTokenizerBatchEncodeDecode")] public void TestTokenizerBatchEncodeDecode() { From 3a9ecf787f8bc6e84a0850f37044e909bf49d06d Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:08:10 -0700 Subject: [PATCH 15/36] Make Model and GeneratorParams be a shared_ptr vs unique_ptr (#212) For safety. This will ensure the Model object's lifetime matches that of any Generator using it. 
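As a rough standalone sketch of the lifetime guarantee this gives (simplified stand-in types, not the actual classes in this repo):

```cpp
#include <cassert>
#include <memory>

struct Model : std::enable_shared_from_this<Model> {
  int id{42};
};

struct Generator {
  // Taking a shared_ptr via shared_from_this keeps the Model alive for as
  // long as any Generator that uses it exists.
  explicit Generator(const Model& model) : model_{model.shared_from_this()} {}
  std::shared_ptr<const Model> model_;
};

int main() {
  std::unique_ptr<Generator> generator;
  {
    auto model = std::make_shared<Model>();
    generator = std::make_unique<Generator>(*model);
  }  // the local shared_ptr goes away here, but the Generator still owns the Model
  assert(generator->model_->id == 42);  // safe: the Model outlives the local handle
}
```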
Same as GeneratorParams and Tokenizer --- src/generators.cpp | 8 +- src/generators.h | 10 +- src/models/input_ids.cpp | 10 +- src/models/kv_cache.cpp | 16 +-- src/models/logits.cpp | 12 +- src/models/model.cpp | 27 ++-- src/models/model.h | 13 +- src/models/position_ids.cpp | 18 +-- src/models/whisper.cpp | 4 +- src/ort_genai_c.cpp | 18 ++- src/python/python.cpp | 50 +++---- src/search.cpp | 68 +++++----- src/search.h | 6 +- src/search_cuda.cpp | 84 ++++++------ src/search_cuda.h | 2 +- test/model_tests.cpp | 118 ++++++++--------- test/sampling_benchmark.cpp | 170 ++++++++++++------------ test/sampling_tests.cpp | 252 ++++++++++++++++++------------------ 18 files changed, 454 insertions(+), 432 deletions(-) diff --git a/src/generators.cpp b/src/generators.cpp index d64ad34a3..6844a9aaf 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -66,7 +66,7 @@ std::unique_ptr CreateSearch(const GeneratorParams& params) { return std::make_unique(params); } -Generator::Generator(const Model& model, const GeneratorParams& params) : model_{model} { +Generator::Generator(const Model& model, const GeneratorParams& params) : model_{model.shared_from_this()} { if (params.search.max_length == 0) throw std::runtime_error("search max_length is 0"); if (params.search.max_length > model.config_->model.context_length) @@ -89,7 +89,7 @@ void Generator::ComputeLogits() { search_->SetLogits(state_->Run(search_->GetSequenceLength(), search_->GetNextTokens(), search_->GetNextIndices())); computed_logits_ = true; - auto& search = search_->params_.search; + auto& search = search_->params_->search; search_->ApplyMinLength(search.min_length); search_->ApplyRepetitionPenalty(search.repetition_penalty); } @@ -112,7 +112,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper } // The user explicitly called TopK_TopP on a beam search - if (search_->params_.search.num_beams != 1) + if (search_->params_->search.num_beams != 1) throw std::runtime_error("TopK and TopP cannot be used with a beam search"); // Sanity checks @@ -134,7 +134,7 @@ void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temper } void Generator::GenerateNextToken() { - auto& search = search_->params_.search; + auto& search = search_->params_->search; if (search.do_sample) GenerateNextToken_TopK_TopP(search.top_k, search.top_p, search.temperature); else diff --git a/src/generators.h b/src/generators.h index 433fda103..1b42b45e9 100644 --- a/src/generators.h +++ b/src/generators.h @@ -44,7 +44,7 @@ enum struct DeviceType { CUDA, }; -struct GeneratorParams { +struct GeneratorParams : std::enable_shared_from_this { GeneratorParams() = default; // This constructor is only used if doing a custom model handler vs built-in GeneratorParams(const Model& model); @@ -91,6 +91,8 @@ struct GeneratorParams { std::variant inputs; std::vector input_ids_owner; // Backing memory of input_ids in some cases + + std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime }; struct Generator { @@ -106,13 +108,15 @@ struct Generator { RoamingArray GetSequence(int index) const; - const Model& model_; + std::shared_ptr model_; std::unique_ptr state_; std::unique_ptr search_; bool computed_logits_{}; // Set to true in ComputeLogits() and false after appending a token to ensure a 1 to 1 call ratio }; -std::unique_ptr CreateModel(OrtEnv& ort_env, const char* config_path); +std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path); +std::shared_ptr CreateGeneratorParams(const 
Model& model); +std::shared_ptr CreateGeneratorParams(); // For benchmarking purposes only std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params); std::vector> Generate(const Model& model, const GeneratorParams& params); // Uses CreateGenerator and a simple loop to return the entire sequence diff --git a/src/models/input_ids.cpp b/src/models/input_ids.cpp index 96a8facc8..88d2514b5 100644 --- a/src/models/input_ids.cpp +++ b/src/models/input_ids.cpp @@ -9,24 +9,24 @@ InputIDs::InputIDs(const Model& model, State& state) : model_{model}, state_{state} { name_ = model_.config_->model.decoder.inputs.input_ids.c_str(); - shape_ = {state_.params_.batch_size, state_.params_.sequence_length}; + shape_ = {state_.params_->batch_size, state_.params_->sequence_length}; type_ = model_.session_info_->GetInputDataType(name_); // If 64-bit, convert from 32-bit to 64-bit if (type_ == Ort::TypeToTensorType::type) { value_ = OrtValue::CreateTensor(model.allocator_cpu_, shape_, type_); auto* p_data = value_->GetTensorMutableData(); - for (auto v : state_.params_.input_ids) { + for (auto v : state_.params_->input_ids) { *p_data++ = v; } } else { if (type_ != Ort::TypeToTensorType::type) throw std::runtime_error("InputIDs must be int64 or int32"); - value_ = OrtValue::CreateTensor(model.allocator_cpu_.GetInfo(), std::span(const_cast(state_.params_.input_ids.data()), shape_[0] * shape_[1]), shape_); + value_ = OrtValue::CreateTensor(model.allocator_cpu_.GetInfo(), std::span(const_cast(state_.params_->input_ids.data()), shape_[0] * shape_[1]), shape_); } - value_ = model_.ExpandInputs(value_, state_.params_.search.num_beams); - shape_[0] *= state_.params_.search.num_beams; + value_ = model_.ExpandInputs(value_, state_.params_->search.num_beams); + shape_[0] *= state_.params_->search.num_beams; } void InputIDs::Add() { diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 4a0910066..17515355f 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -8,7 +8,7 @@ KV_Cache_Combined::KV_Cache_Combined(const Model& model, State& state) : model_{model}, state_{state}, layer_count_{model.config_->model.decoder.num_hidden_layers}, - shape_{2, state_.params_.BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { + shape_{2, state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { pasts_.resize(layer_count_); presents_.reserve(layer_count_); @@ -25,7 +25,7 @@ KV_Cache_Combined::KV_Cache_Combined(const Model& model, State& state) type_ = model_.session_info_->GetInputDataType(input_name_strings_[0]); empty_past_ = OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_); - shape_[3] = state_.params_.sequence_length; + shape_[3] = state_.params_->sequence_length; for (int i = 0; i < layer_count_; ++i) { presents_.push_back(OrtValue::CreateTensor(*model.allocator_device_, shape_, type_)); @@ -45,7 +45,7 @@ void KV_Cache_Combined::Add() { } void KV_Cache_Combined::Update(std::span beam_indices, int current_length) { - assert(state_.params_.search.num_beams == 1 || !beam_indices.empty()); // We require beam_indices if we're a beam search + assert(state_.params_->search.num_beams == 1 || !beam_indices.empty()); // We require beam_indices if we're a beam search for (int i = 0; i < layer_count_; i++) { if (beam_indices.empty()) { @@ -117,8 +117,8 @@ KV_Cache::KV_Cache(const Model& model, State& state) : model_{model}, state_{state}, 
layer_count_{model_.config_->model.decoder.num_hidden_layers}, - past_present_share_buffer_{state_.params_.search.past_present_share_buffer && state_.params_.search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA}, - shape_{state_.params_.BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { + past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && model_.device_type_ == DeviceType::CUDA}, + shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { pasts_.resize(layer_count_ * 2); presents_.reserve(layer_count_ * 2); @@ -142,9 +142,9 @@ KV_Cache::KV_Cache(const Model& model, State& state) // Set the size after empty_past_ has been created with 0 for this field if (past_present_share_buffer_) - shape_[2] = state_.params_.search.max_length; + shape_[2] = state_.params_->search.max_length; else - shape_[2] = state_.params_.sequence_length; + shape_[2] = state_.params_->sequence_length; for (int i = 0; i < layer_count_; ++i) { presents_.push_back(OrtValue::CreateTensor(*model_.allocator_device_, shape_, type_)); @@ -245,7 +245,7 @@ Cross_Cache::Cross_Cache(const Model& model, State& state) : model_{model}, state_{state}, layer_count_{model_.config_->model.decoder.num_hidden_layers}, - shape_{state_.params_.BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 1500, model.config_->model.decoder.head_size} { + shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 1500, model.config_->model.decoder.head_size} { values_.reserve(layer_count_ * 2); for (int i = 0; i < layer_count_; ++i) { diff --git a/src/models/logits.cpp b/src/models/logits.cpp index 7dc79e53a..d7dd837f3 100644 --- a/src/models/logits.cpp +++ b/src/models/logits.cpp @@ -7,7 +7,7 @@ namespace Generators { Logits::Logits(const Model& model, State& state) : model_{model}, state_{state}, - shape_{static_cast(state_.params_.batch_size) * state_.params_.search.num_beams, state_.params_.sequence_length, state_.params_.vocab_size}, + shape_{static_cast(state_.params_->batch_size) * state_.params_->search.num_beams, state_.params_->sequence_length, state_.params_->vocab_size}, type_{model_.session_info_->GetOutputDataType(model_.config_->model.decoder.outputs.logits)} { if (model_.device_type_ == DeviceType::CPU && type_ != Ort::TypeToTensorType::type) throw std::runtime_error("Model logits_type can only be float32 on CPU"); @@ -34,7 +34,7 @@ RoamingArray Logits::Get() { if (shape_[1] != 1) { const size_t seq_length = shape_[1]; const size_t vocab_size = shape_[2]; - const size_t num_beams = state_.params_.search.num_beams; + const size_t num_beams = state_.params_->search.num_beams; shape_[1] = 1; auto value_next = OrtValue::CreateTensor(*model_.allocator_device_, shape_); @@ -42,12 +42,12 @@ RoamingArray Logits::Get() { size_t vocab_index = 0; // Simpler math to have this index go up by vocab_size for every logit chunk we process - const auto* input_ids = state_.params_.input_ids.data(); - for (int batch_index = 0; batch_index < state_.params_.batch_size; batch_index++) { + const auto* input_ids = state_.params_->input_ids.data(); + for (int batch_index = 0; batch_index < state_.params_->batch_size; batch_index++) { // Find the first non pad token from the end size_t token_index = seq_length; while (token_index-- > 0) { - if (input_ids[token_index] != state_.params_.pad_token_id) + if 
(input_ids[token_index] != state_.params_->pad_token_id) break; } @@ -57,7 +57,7 @@ RoamingArray Logits::Get() { auto target = logits_next.subspan(vocab_index, vocab_size); #if USE_CUDA if (model_.device_type_ == DeviceType::CUDA) - CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_.cuda_stream); + CudaCheck() == cudaMemcpyAsync(target.data(), source.data(), source.size_bytes(), cudaMemcpyDeviceToDevice, state_.params_->cuda_stream); else #endif copy(source, target); diff --git a/src/models/model.cpp b/src/models/model.cpp index 993b66248..a31b1ed84 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -12,7 +12,7 @@ namespace Generators { -State::State(const GeneratorParams& params) : params_{params} { +State::State(const GeneratorParams& params) : params_{params.shared_from_this()} { } void State::Run(OrtSession& session) { @@ -94,13 +94,13 @@ void CheckResult(tfmError_t error) { } TokenizerStream::TokenizerStream(const Tokenizer& tokenizer) - : tokenizer_{tokenizer} { + : tokenizer_{tokenizer.shared_from_this()} { CheckResult(TfmCreate(kTfmKindDetokenizerCache, cache_.Address())); } const std::string& TokenizerStream::Decode(int32_t token) { const char* string; - CheckResult(TfmDetokenizeCached(tokenizer_.tokenizer_, cache_, token, &string)); + CheckResult(TfmDetokenizeCached(tokenizer_->tokenizer_, cache_, token, &string)); chunk_ = string; return chunk_; } @@ -297,23 +297,32 @@ void Model::CreateSessionOptions() { } } -std::unique_ptr Model::CreateTokenizer() const { - return std::make_unique(*config_); +std::shared_ptr Model::CreateTokenizer() const { + return std::make_shared(*config_); } -std::unique_ptr CreateModel(OrtEnv& ort_env, const char* config_path) { +std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path) { auto config = std::make_unique(config_path); if (config->model.type == "gpt2") - return std::make_unique(std::move(config), ort_env); + return std::make_shared(std::move(config), ort_env); if (config->model.type == "llama" || config->model.type == "gemma" || config->model.type == "mistral" || config->model.type == "phi") - return std::make_unique(std::move(config), ort_env); + return std::make_shared(std::move(config), ort_env); if (config->model.type == "whisper") - return std::make_unique(std::move(config), ort_env); + return std::make_shared(std::move(config), ort_env); throw std::runtime_error("Unsupported model_type in config.json: " + config->model.type); } +std::shared_ptr CreateGeneratorParams(const Model& model) { + return std::make_shared(model); +} + +// Used by benchmarking tests only, should not be used normally +std::shared_ptr CreateGeneratorParams() { + return std::make_shared(); +} + #if USE_CUDA void ConvertFp16ToFp32(OrtAllocator& allocator, cudaStream_t stream, OrtValue& in, std::unique_ptr& p_out) { auto shape_info = in.GetTensorTypeAndShapeInfo(); diff --git a/src/models/model.h b/src/models/model.h index 3f1d4ceca..9af784362 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -15,7 +15,7 @@ struct State { virtual RoamingArray Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices = {}) = 0; - const GeneratorParams& params_; + std::shared_ptr params_; std::vector input_names_, output_names_; std::vector inputs_, outputs_; @@ -57,7 +57,7 @@ struct TokenizerStream { const std::string& Decode(int32_t token); private: - const Tokenizer& tokenizer_; + std::shared_ptr tokenizer_; TfmPtr cache_; std::string chunk_; }; @@ -66,7 
+66,7 @@ struct TokenizerStream { // Sequence length is vector.size()/count std::vector PadInputs(std::span > sequences, int32_t pad_token_id); -struct Tokenizer { +struct Tokenizer : std::enable_shared_from_this { Tokenizer(Config& config); std::unique_ptr CreateStream() const; @@ -78,6 +78,7 @@ struct Tokenizer { std::vector DecodeBatch(std::span sequences, size_t count) const; TfmPtr tokenizer_; + std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime private: int32_t pad_token_id_; @@ -94,11 +95,11 @@ struct SessionInfo { std::unordered_map inputs_, outputs_; }; -struct Model { +struct Model : std::enable_shared_from_this { Model(std::unique_ptr config); virtual ~Model(); - std::unique_ptr CreateTokenizer() const; + std::shared_ptr CreateTokenizer() const; virtual std::unique_ptr CreateState(RoamingArray sequence_lengths, const GeneratorParams& params) const = 0; @@ -113,6 +114,8 @@ struct Model { std::unique_ptr session_info_; + std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + protected: void InitDeviceAllocator(OrtSession& session); void CreateSessionOptions(); diff --git a/src/models/position_ids.cpp b/src/models/position_ids.cpp index bfff6e161..a0e8d6b56 100644 --- a/src/models/position_ids.cpp +++ b/src/models/position_ids.cpp @@ -12,7 +12,7 @@ PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray if (type_ != Ort::TypeToTensorType::type && type_ != Ort::TypeToTensorType::type) throw std::runtime_error("position_ids & attention_mask only support int32 or int64 types"); - std::array shape{state_.params_.batch_size, state_.params_.sequence_length}; // Only batch_size initially, as we haven't expanded over the beams yet + std::array shape{state_.params_->batch_size, state_.params_->sequence_length}; // Only batch_size initially, as we haven't expanded over the beams yet position_ids_ = OrtValue::CreateTensor(model.allocator_cpu_, shape, type_); position_ids_next_ = OrtValue::CreateTensor(model.allocator_cpu_, std::array{shape[0], 1}, type_); attention_mask_ = OrtValue::CreateTensor(model.allocator_cpu_, shape, type_); @@ -22,10 +22,10 @@ PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray else InitializeTensors(shape, sequence_lengths_unk); - position_ids_ = model_.ExpandInputs(position_ids_, state_.params_.search.num_beams); - position_ids_next_ = model_.ExpandInputs(position_ids_next_, state_.params_.search.num_beams); - attention_mask_ = model_.ExpandInputs(attention_mask_, state_.params_.search.num_beams); - shape[0] *= state_.params_.search.num_beams; + position_ids_ = model_.ExpandInputs(position_ids_, state_.params_->search.num_beams); + position_ids_next_ = model_.ExpandInputs(position_ids_next_, state_.params_->search.num_beams); + attention_mask_ = model_.ExpandInputs(attention_mask_, state_.params_->search.num_beams); + shape[0] *= state_.params_->search.num_beams; position_ids_shape_ = shape; attention_mask_shape_ = shape; } @@ -106,13 +106,13 @@ void PositionIDs::InitializeTensors(std::array shape, cpu_spanGetTensorMutableData(); auto* position_data = position_ids_->GetTensorMutableData(); auto* position_data_next = position_ids_next_->GetTensorMutableData(); - const auto* word_id = state_.params_.input_ids.data(); + const auto* word_id = state_.params_->input_ids.data(); auto* mask = mask_data; auto* position = position_data; for (int i = 0; i < shape[0]; i++) { T abs_position = 0; for (int j = 0; j < shape[1]; j++, word_id++, mask++, 
position++) { - if (*word_id == state_.params_.pad_token_id) { + if (*word_id == state_.params_->pad_token_id) { *mask = 0; *position = 0; } else { @@ -122,8 +122,8 @@ void PositionIDs::InitializeTensors(std::array shape, cpu_span(abs_position); + for (int k = 0; k < state_.params_->search.num_beams; k++) { + sequence_lengths[i * state_.params_->search.num_beams + k] = static_cast(abs_position); } } } diff --git a/src/models/whisper.cpp b/src/models/whisper.cpp index f6f2aaea1..5c8fe9d83 100644 --- a/src/models/whisper.cpp +++ b/src/models/whisper.cpp @@ -20,12 +20,12 @@ Whisper_State::Whisper_State(const Whisper_Model& model, RoamingArray s model_{model} { auto& inputs = const_cast(std::get(params.inputs)); - auto encoder_input_ids = model_.ExpandInputs(inputs.input_features, params_.search.num_beams); + auto encoder_input_ids = model_.ExpandInputs(inputs.input_features, params_->search.num_beams); encoder_hidden_states_ = OrtValue::CreateTensor(*model_.allocator_device_, std::array{decoder_input_ids_.GetShape()[0], 1500, 384}); auto sequence_lengths = sequence_lengths_unk.GetCPU(); for (int i = 0; i < decoder_input_ids_.GetShape()[0]; i++) { - sequence_lengths[i] = static_cast(params_.sequence_length); + sequence_lengths[i] = static_cast(params_->sequence_length); } input_names_.push_back("encoder_input_ids"); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index fbb986af0..bbf84be51 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -63,14 +63,18 @@ const int32_t* OGA_API_CALL OgaSequencesGetSequenceData(const OgaSequences* p, s OgaResult* OGA_API_CALL OgaCreateModel(const char* config_path, OgaModel** out) { OGA_TRY - *out = reinterpret_cast(Generators::CreateModel(Generators::GetOrtEnv(), config_path).release()); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), config_path); + model->external_owner_ = model; + *out = reinterpret_cast(model.get()); return nullptr; OGA_CATCH } OgaResult* OGA_API_CALL OgaCreateGeneratorParams(const OgaModel* model, OgaGeneratorParams** out) { OGA_TRY - *out = reinterpret_cast(new Generators::GeneratorParams(*reinterpret_cast(model))); + auto params = std::make_shared(*reinterpret_cast(model)); + params->external_owner_ = params; + *out = reinterpret_cast(params.get()); return nullptr; OGA_CATCH } @@ -192,7 +196,9 @@ const int32_t* OGA_API_CALL OgaGenerator_GetSequence(const OgaGenerator* oga_gen OgaResult* OGA_API_CALL OgaCreateTokenizer(const OgaModel* model, OgaTokenizer** out) { OGA_TRY - *out = reinterpret_cast(reinterpret_cast(model)->CreateTokenizer().release()); + auto tokenizer = reinterpret_cast(model)->CreateTokenizer(); + tokenizer->external_owner_ = tokenizer; + *out = reinterpret_cast(tokenizer.get()); return nullptr; OGA_CATCH } @@ -265,11 +271,11 @@ void OGA_API_CALL OgaDestroySequences(OgaSequences* p) { } void OGA_API_CALL OgaDestroyModel(OgaModel* p) { - delete reinterpret_cast(p); + reinterpret_cast(p)->external_owner_ = nullptr; } void OGA_API_CALL OgaDestroyGeneratorParams(OgaGeneratorParams* p) { - delete reinterpret_cast(p); + reinterpret_cast(p)->external_owner_ = nullptr; } void OGA_API_CALL OgaDestroyGenerator(OgaGenerator* p) { @@ -277,7 +283,7 @@ void OGA_API_CALL OgaDestroyGenerator(OgaGenerator* p) { } void OGA_API_CALL OgaDestroyTokenizer(OgaTokenizer* p) { - delete reinterpret_cast(p); + reinterpret_cast(p)->external_owner_ = nullptr; } void OGA_API_CALL OgaDestroyTokenizerStream(OgaTokenizerStream* p) { diff --git a/src/python/python.cpp b/src/python/python.cpp index a1667eb90..1c8db803d 
100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -53,26 +53,33 @@ void Declare_DeviceArray(pybind11::module& m, const char* name) { "get_array", [](Type& t) -> pybind11::array_t { return t.GetNumpy(); }, pybind11::return_value_policy::reference_internal); } -struct PyGeneratorParams : GeneratorParams { +struct PyGeneratorParams { + PyGeneratorParams(const Model& model) : params_{std::make_shared(model)} { + } + + operator const GeneratorParams&() const { return *params_; } + + std::shared_ptr params_; + // Turn the python py_input_ids_ into the low level parameters void Prepare() { // TODO: This will switch to using the variant vs being ifs if (py_input_ids_.size() != 0) { if (py_input_ids_.ndim() == 1) { // Just a 1D array - batch_size = 1; - sequence_length = static_cast(py_input_ids_.shape(0)); + params_->batch_size = 1; + params_->sequence_length = static_cast(py_input_ids_.shape(0)); } else { if (py_input_ids_.ndim() != 2) throw std::runtime_error("Input IDs can only be 1 or 2 dimensional"); - batch_size = static_cast(py_input_ids_.shape(0)); - sequence_length = static_cast(py_input_ids_.shape(1)); + params_->batch_size = static_cast(py_input_ids_.shape(0)); + params_->sequence_length = static_cast(py_input_ids_.shape(1)); } - input_ids = ToSpan(py_input_ids_); + params_->input_ids = ToSpan(py_input_ids_); } if (py_whisper_input_features_.size() != 0) { - GeneratorParams::Whisper& whisper = inputs.emplace(); + GeneratorParams::Whisper& whisper = params_->inputs.emplace(); #ifdef __APPLE__ std::span shape(reinterpret_cast(py_whisper_input_features_.shape()), py_whisper_input_features_.ndim()); @@ -81,9 +88,9 @@ struct PyGeneratorParams : GeneratorParams { #endif whisper.input_features = OrtValue::CreateTensor(Ort::Allocator::GetWithDefaultOptions().GetInfo(), ToSpan(py_whisper_input_features_), shape); whisper.decoder_input_ids = ToSpan(py_whisper_decoder_input_ids_); - batch_size = 1; - sequence_length = static_cast(py_whisper_decoder_input_ids_.shape(1)); - input_ids = ToSpan(py_whisper_decoder_input_ids_); + params_->batch_size = 1; + params_->sequence_length = static_cast(py_whisper_decoder_input_ids_.shape(1)); + params_->input_ids = ToSpan(py_whisper_decoder_input_ids_); } } @@ -92,11 +99,11 @@ struct PyGeneratorParams : GeneratorParams { auto name = entry.first.cast(); try { if (pybind11::isinstance(entry.second)) { - SetSearchNumber(search, name, entry.second.cast()); + SetSearchNumber(params_->search, name, entry.second.cast()); } else if (pybind11::isinstance(entry.second)) { - SetSearchBool(search, name, entry.second.cast()); + SetSearchBool(params_->search, name, entry.second.cast()); } else if (pybind11::isinstance(entry.second)) { - SetSearchNumber(search, name, entry.second.cast()); + SetSearchNumber(params_->search, name, entry.second.cast()); } else throw std::runtime_error("Unknown search option type, can be float/bool/int:" + name); } catch (JSON::unknown_value_error& e) { @@ -182,9 +189,9 @@ PYBIND11_MODULE(onnxruntime_genai, m) { pybind11::class_(m, "GeneratorParams") .def(pybind11::init()) - .def_readonly("pad_token_id", &PyGeneratorParams::pad_token_id) - .def_readonly("eos_token_id", &PyGeneratorParams::eos_token_id) - .def_readonly("vocab_size", &PyGeneratorParams::vocab_size) + .def_property_readonly("pad_token_id", [](const PyGeneratorParams& v) { return v.params_->pad_token_id; }) + .def_property_readonly("eos_token_id", [](const PyGeneratorParams& v) { return v.params_->eos_token_id; }) + .def_property_readonly("vocab_size", [](const 
PyGeneratorParams& v) { return v.params_->vocab_size; }) .def_readwrite("input_ids", &PyGeneratorParams::py_input_ids_) .def_readwrite("whisper_input_features", &PyGeneratorParams::py_whisper_input_features_) .def_readwrite("whisper_decoder_input_ids", &PyGeneratorParams::py_whisper_decoder_input_ids_) @@ -196,7 +203,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { pybind11::class_(m, "TokenizerStream") .def("decode", [](TokenizerStream& t, int32_t token) { return t.Decode(token); }); - pybind11::class_(m, "Tokenizer") + pybind11::class_>(m, "Tokenizer") .def(pybind11::init([](Model& model) { return model.CreateTokenizer(); })) .def("encode", &Tokenizer::Encode) .def("decode", [](const Tokenizer& t, pybind11::array_t tokens) { return t.Decode(ToSpan(tokens)); }) @@ -216,18 +223,11 @@ PYBIND11_MODULE(onnxruntime_genai, m) { }) .def("create_stream", [](const Tokenizer& t) { return t.CreateStream(); }); - pybind11::class_(m, "Model") + pybind11::class_>(m, "Model") .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) - .def("generate_sequence", [](Model& model, pybind11::array_t input_ids, const pybind11::dict& search_options) { - PyGeneratorParams params{model}; - params.SetSearchOptions(search_options); - params.py_input_ids_ = input_ids; - params.Prepare(); - return Generate(model, params)[0]; - }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") diff --git a/src/search.cpp b/src/search.cpp index cc3a6fd10..dd3389270 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -9,7 +9,7 @@ namespace Generators { Search_Cpu::Search_Cpu(const GeneratorParams& params) : Search{params}, - sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_.search.max_length} { + sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_->search.max_length} { auto batch_beam_size = params.BatchBeamSize(); sequence_lengths_buffer_ = AllocateArray(batch_beam_size, &sequence_lengths_); } @@ -25,8 +25,8 @@ GreedySearch_Cpu::GreedySearch_Cpu(const GeneratorParams& params) BeamSearch_Cpu::BeamSearch_Cpu(const GeneratorParams& params) : Search_Cpu(params) { - assert(params_.search.num_beams > 1); // If 1, use GreedySearch - beam_scorer_ = std::make_unique(params_); + assert(params_->search.num_beams > 1); // If 1, use GreedySearch + beam_scorer_ = std::make_unique(*params_); } BeamSearch_Cpu::~BeamSearch_Cpu() = default; @@ -58,16 +58,16 @@ void BeamSearch_Cpu::SelectTop() { // TODO(tianleiwu): use thread pool to parallel int offset = 0; int batch_beam_index = 0; - for (int i = 0; i < params_.batch_size; i++) { - for (int j = 0; j < params_.search.num_beams; j++, batch_beam_index++) { - for (int k = 0; k < params_.vocab_size; k++, offset++) { + for (int i = 0; i < params_->batch_size; i++) { + for (int j = 0; j < params_->search.num_beams; j++, batch_beam_index++) { + for (int k = 0; k < params_->vocab_size; k++, offset++) { next_token_scores_[offset] += beam_scores[batch_beam_index]; } } } // TODO: Write output scores? 
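Declaring `Model` and `Tokenizer` with a `std::shared_ptr` holder, as in the binding changes above, lets Python reference counting and C++ `shared_ptr` ownership cooperate, so C++ objects that copy the pointer keep the instance alive after the Python variable is garbage collected. A self-contained sketch with a hypothetical `Widget` class:

```cpp
#include <pybind11/pybind11.h>
#include <memory>

namespace py = pybind11;

struct Widget {
  explicit Widget(int value) : value_{value} {}
  int value_;
};

// Hypothetical factory mirroring the "create via shared_ptr" style.
std::shared_ptr<Widget> CreateWidget(int value) {
  return std::make_shared<Widget>(value);
}

PYBIND11_MODULE(widget_demo, m) {
  // The second template argument is the holder type: pybind11 stores the
  // object in a shared_ptr, so C++ code that copies that shared_ptr keeps
  // the object alive independently of the Python reference.
  py::class_<Widget, std::shared_ptr<Widget>>(m, "Widget")
      .def(py::init([](int value) { return CreateWidget(value); }))
      .def_readonly("value", &Widget::value_);
}
```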
- unsigned const top_k = 2 * params_.search.num_beams; + const size_t top_k = 2 * params_->search.num_beams; struct ScoreIndex { float score; @@ -76,17 +76,17 @@ void BeamSearch_Cpu::SelectTop() { bool operator<(const ScoreIndex& s) const { return score < s.score; } }; - auto scores = std::make_unique(top_k * params_.batch_size); - auto indices = std::make_unique(top_k * params_.batch_size); - auto tokens = std::make_unique(top_k * params_.batch_size); + auto scores = std::make_unique(top_k * params_->batch_size); + auto indices = std::make_unique(top_k * params_->batch_size); + auto tokens = std::make_unique(top_k * params_->batch_size); - auto next_scores = std::span(scores.get(), top_k * params_.batch_size); - auto next_indices = std::span(indices.get(), top_k * params_.batch_size); - auto next_tokens = std::span(tokens.get(), top_k * params_.batch_size); + auto next_scores = std::span(scores.get(), top_k * params_->batch_size); + auto next_indices = std::span(indices.get(), top_k * params_->batch_size); + auto next_tokens = std::span(tokens.get(), top_k * params_->batch_size); - for (size_t batch_index = 0; batch_index < static_cast(params_.batch_size); batch_index++) { + for (size_t batch_index = 0; batch_index < static_cast(params_->batch_size); batch_index++) { std::priority_queue> queue; - auto token_scores_sub = next_token_scores_.subspan(batch_index * params_.search.num_beams * params_.vocab_size, params_.search.num_beams * params_.vocab_size); + auto token_scores_sub = next_token_scores_.subspan(batch_index * params_->search.num_beams * params_->vocab_size, static_cast(params_->search.num_beams) * params_->vocab_size); for (int i = 0; i < token_scores_sub.size(); i++) { queue.push({token_scores_sub[i], i}); } @@ -96,8 +96,8 @@ void BeamSearch_Cpu::SelectTop() { auto next_scores_sub = next_scores.subspan(top_k * batch_index, top_k); for (unsigned i = 0; i < top_k; i++) { auto v = queue.top(); - next_indices_sub[i] = v.index / params_.vocab_size; - next_tokens_sub[i] = v.index % params_.vocab_size; + next_indices_sub[i] = v.index / params_->vocab_size; + next_tokens_sub[i] = v.index % params_->vocab_size; next_scores_sub[i] = v.score; queue.pop(); } @@ -117,12 +117,12 @@ void BeamSearch_Cpu::SelectTop() { void GreedySearch_Cpu::SelectTop() { // next_tokens = torch.argmax(scores, dim=-1) - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { continue; } - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); auto const token = static_cast(std::distance(scores.begin(), std::max_element(scores.begin(), scores.end()))); SetNextToken(batch_id, token); } @@ -144,8 +144,8 @@ void SoftMax(std::span scores, float temperature) { } void GreedySearch_Cpu::SampleTopK(int k, float temperature) { - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); SoftMax(scores, temperature); // Find the top K scores std::vector indices(scores.size()); @@ -160,11 +160,11 @@ void GreedySearch_Cpu::SampleTopK(int k, float 
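The CPU search hunks above pair a temperature softmax with a top-k pick (a max-heap `std::priority_queue` for beam search, sorting for greedy sampling). A standalone sketch of those two building blocks, independent of the `Generators` types; note this version subtracts the max score before `exp()` for numerical stability, which may differ in detail from the project's `SoftMax`:

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

// In-place softmax with temperature; max-subtraction keeps exp() stable.
void SoftMax(std::vector<float>& scores, float temperature) {
  const float max_score = *std::max_element(scores.begin(), scores.end());
  float sum = 0.0f;
  for (auto& s : scores) {
    s = std::exp((s - max_score) / temperature);
    sum += s;
  }
  for (auto& s : scores) s /= sum;
}

// Sample one token id from the k highest-probability entries.
// Assumes 0 < k <= probs.size().
int SampleTopK(const std::vector<float>& probs, int k, std::mt19937& gen) {
  std::vector<int> indices(probs.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int a, int b) { return probs[a] > probs[b]; });
  std::vector<float> top(k);
  for (int i = 0; i < k; i++) top[i] = probs[indices[i]];
  std::discrete_distribution<int> dist(top.begin(), top.end());
  return indices[dist(gen)];
}
```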
temperature) { void GreedySearch_Cpu::SampleTopP(float p, float temperature) { std::uniform_real_distribution dis(0, p); - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { continue; } - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); SoftMax(scores, temperature); // Sort an array of indices into the scores std::vector indices(scores.size()); @@ -189,11 +189,11 @@ void GreedySearch_Cpu::SampleTopP(float p, float temperature) { void GreedySearch_Cpu::SampleTopKTopP(int k, float p, float temperature) { std::uniform_real_distribution dis(0, p); - for (size_t batch_id = 0; batch_id < params_.batch_size; batch_id++) { + for (size_t batch_id = 0; batch_id < params_->batch_size; batch_id++) { if (PadIfAlreadyEOS(batch_id)) { continue; } - std::span const scores = next_token_scores_.subspan(batch_id * params_.vocab_size, params_.vocab_size); + std::span const scores = next_token_scores_.subspan(batch_id * params_->vocab_size, params_->vocab_size); SoftMax(scores, temperature); // Find the top K scores std::vector indices(scores.size()); @@ -222,13 +222,13 @@ bool GreedySearch_Cpu::PadIfAlreadyEOS(size_t batch_id) { return false; } - next_tokens_[batch_id] = params_.pad_token_id; + next_tokens_[batch_id] = params_->pad_token_id; return true; } void GreedySearch_Cpu::SetNextToken(size_t batch_id, int32_t token) { next_tokens_[batch_id] = token; - if (token == params_.eos_token_id) { + if (token == params_->eos_token_id) { eos_seen_[batch_id] = true; if (--not_done_count_ == 0) { done_ = true; @@ -239,7 +239,7 @@ void GreedySearch_Cpu::SetNextToken(size_t batch_id, int32_t token) { void GreedySearch_Cpu::AppendNextTokensToSequences() { sequences_.AppendNextTokenToSequences(next_tokens_); - if (sequences_.GetSequenceLength() == params_.search.max_length) { + if (sequences_.GetSequenceLength() == params_->search.max_length) { done_ = true; } } @@ -247,7 +247,7 @@ void GreedySearch_Cpu::AppendNextTokensToSequences() { void BeamSearch_Cpu::AppendNextTokensToSequences() { sequences_.AppendNextTokenToSequences(beam_scorer_->GetNextIndicesCPU(), beam_scorer_->GetNextTokens()); - if (sequences_.GetSequenceLength() == params_.search.max_length) { + if (sequences_.GetSequenceLength() == params_->search.max_length) { done_ = true; } } @@ -257,8 +257,8 @@ void BeamSearch_Cpu::Finalize(size_t num_return_sequences, RoamingArray } std::span Search_Cpu::GetScores(int batch_beam_index) const { - assert(batch_beam_index >= 0 && batch_beam_index < params_.BatchBeamSize()); - return next_token_scores_.subspan(batch_beam_index * params_.vocab_size, params_.vocab_size); + assert(batch_beam_index >= 0 && batch_beam_index < params_->BatchBeamSize()); + return next_token_scores_.subspan(static_cast(batch_beam_index) * params_->vocab_size, params_->vocab_size); } void Search_Cpu::ApplyMinLength(int min_length) { @@ -266,10 +266,10 @@ void Search_Cpu::ApplyMinLength(int min_length) { return; } - const int batch_beam_size = params_.BatchBeamSize(); + const int batch_beam_size = params_->BatchBeamSize(); for (int i = 0; i < batch_beam_size; i++) { std::span const beam_token_scores = GetScores(i); - beam_token_scores[params_.eos_token_id] = std::numeric_limits::lowest(); + beam_token_scores[params_->eos_token_id] = 
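For the `SampleTopP` path, a common formulation (an illustrative sketch, not necessarily the exact scheme used by `GreedySearch_Cpu`) sorts the probabilities in descending order, keeps the smallest prefix whose cumulative mass reaches `p`, and draws one token from that prefix:

```cpp
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// probs must already be a normalized distribution (e.g. softmax output).
int SampleTopP(const std::vector<float>& probs, float p, std::mt19937& gen) {
  std::vector<int> indices(probs.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::sort(indices.begin(), indices.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

  // Keep the smallest prefix of the sorted tokens whose mass reaches p.
  std::vector<float> kept;
  float cumulative = 0.0f;
  for (int idx : indices) {
    kept.push_back(probs[idx]);
    cumulative += probs[idx];
    if (cumulative >= p) break;
  }

  // discrete_distribution normalizes the kept weights and draws one token.
  std::discrete_distribution<int> dist(kept.begin(), kept.end());
  return indices[dist(gen)];
}
```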
std::numeric_limits::lowest(); } } @@ -277,7 +277,7 @@ void Search_Cpu::ApplyRepetitionPenalty(float penalty) { if (penalty == 1.0f) return; - const int batch_beam_size = params_.BatchBeamSize(); + const int batch_beam_size = params_->BatchBeamSize(); for (int i = 0; i < batch_beam_size; i++) { std::span const beam_token_scores = GetScores(i); std::span const sequence = sequences_.GetSequence(i); diff --git a/src/search.h b/src/search.h index bc81313eb..5a52c11e2 100644 --- a/src/search.h +++ b/src/search.h @@ -6,7 +6,7 @@ namespace Generators { struct BeamSearchScorer; struct Search { - Search(const GeneratorParams& params) : params_{params} {} + Search(const GeneratorParams& params) : params_{params.shared_from_this()} {} virtual ~Search() = default; virtual RoamingArray GetNextTokens() = 0; @@ -30,7 +30,7 @@ struct Search { virtual void ApplyMinLength(int min_length) = 0; virtual void ApplyRepetitionPenalty(float penalty) = 0; - const GeneratorParams& params_; + std::shared_ptr params_; }; struct Search_Cpu : Search { @@ -81,7 +81,7 @@ struct GreedySearch_Cpu : Search_Cpu { std::span eos_seen_; // shape (batch_size) std::unique_ptr eos_seen_buffer_; - int not_done_count_{params_.batch_size}; // When zero, every batch entry is done (starts at batch_size_) + int not_done_count_{params_->batch_size}; // When zero, every batch entry is done (starts at batch_size_) std::random_device rd_; std::mt19937 gen_; diff --git a/src/search_cuda.cpp b/src/search_cuda.cpp index 304f62cc2..aa6d85431 100644 --- a/src/search_cuda.cpp +++ b/src/search_cuda.cpp @@ -17,13 +17,13 @@ void OnCudaError(cudaError_t error) { Search_Cuda::Search_Cuda(const GeneratorParams& params) : Search{params}, - sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_.search.max_length, params_.cuda_stream} { + sequences_{params.input_ids, params.batch_size, params.search.num_beams, params_->search.max_length, params_->cuda_stream} { auto batch_beam_size = params.BatchBeamSize(); sequence_lengths_buffer_ = std::make_unique(batch_beam_size); sequence_lengths_ = cpu_span(sequence_lengths_buffer_.get(), batch_beam_size); eos_meet_buffer_ = CudaMallocArray(batch_beam_size, &eos_meet_); - cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_.cuda_stream); + cudaMemsetAsync(eos_meet_.data(), 0, eos_meet_.size_bytes(), params_->cuda_stream); done_cpu_ = CudaMallocHostArray(1); *done_cpu_ = false; @@ -32,26 +32,26 @@ Search_Cuda::Search_Cuda(const GeneratorParams& params) GreedySearch_Cuda::GreedySearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { next_tokens_buffer_ = CudaMallocArray(params.batch_size, &next_tokens_); - cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_.cuda_stream); - samplingdata_ = std::make_unique(params_.batch_size, params_.vocab_size, params_.cuda_stream); + cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_->cuda_stream); + samplingdata_ = std::make_unique(params_->batch_size, params_->vocab_size, params_->cuda_stream); } BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { - assert(params_.search.num_beams > 1); // If 1, use GreedySearch - auto batch_beam_size = params_.BatchBeamSize(); - beam_scorer_ = std::make_unique(params_); + assert(params_->search.num_beams > 1); // If 1, use GreedySearch + auto batch_beam_size = params_->BatchBeamSize(); + beam_scorer_ = std::make_unique(*params_); topk_next_tokens_ = CudaMallocArray(2 * batch_beam_size); topk_next_indices_ = 
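The `search.h` change stores `params_` as a `shared_ptr` obtained via `shared_from_this()`, which only works when the `GeneratorParams` object is itself owned by a `shared_ptr` (hence the test and benchmark updates later in this patch that replace stack-allocated params with a `CreateGeneratorParams` factory). A reduced sketch of that contract, with hypothetical `Params`/`Consumer` names:

```cpp
#include <memory>

struct Params : std::enable_shared_from_this<Params> {
  int max_length{10};
};

struct Consumer {
  // Taking a reference but storing a shared_ptr: the argument must already
  // be owned by a shared_ptr, otherwise shared_from_this() throws
  // std::bad_weak_ptr (since C++17).
  explicit Consumer(const Params& params) : params_{params.shared_from_this()} {}
  std::shared_ptr<const Params> params_;
};

int main() {
  auto params = std::make_shared<Params>();  // heap-allocated, shared ownership
  Consumer consumer{*params};                // keeps params alive via params_
  params.reset();                            // consumer still holds a reference
  return consumer.params_->max_length == 10 ? 0 : 1;
}
```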
CudaMallocArray(2 * batch_beam_size); topk_next_scores_ = CudaMallocArray(2 * batch_beam_size); constexpr size_t max_parts_of_vocab = 128; - size_t topk_buffer_size = batch_beam_size * (max_parts_of_vocab + 1) * params_.search.num_beams * 2 * 2; + size_t topk_buffer_size = batch_beam_size * (max_parts_of_vocab + 1) * params_->search.num_beams * 2 * 2; topk_buffer_ = CudaMallocArray(topk_buffer_size); static_assert(sizeof(float) == sizeof(int32_t)); // The topk_buffer assumes these match, fix for float16 - cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), params_.cuda_stream); + cudaMemsetAsync(topk_buffer_.get(), 0, topk_buffer_size * sizeof(float), params_->cuda_stream); } BeamSearch_Cuda::~BeamSearch_Cuda() = default; @@ -82,13 +82,13 @@ void BeamSearch_Cuda::SelectTop() { // Add beam score to next token scores. Corresponding python code is like: // next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) cuda::LaunchAddProbsKernel(next_token_scores_.data(), beam_scores.data(), - params_.batch_size, params_.search.num_beams, params_.vocab_size, params_.cuda_stream); + params_->batch_size, params_->search.num_beams, params_->vocab_size, params_->cuda_stream); // TODO: Write output scores? - if (params_.search.num_beams <= 32) { + if (params_->search.num_beams <= 32) { constexpr size_t max_parts_of_vocab = 128; - size_t candidate_count = params_.BatchBeamSize() * 2 * params_.search.num_beams; + size_t candidate_count = params_->BatchBeamSize() * 2 * params_->search.num_beams; float* topk_tmp_buffer = topk_buffer_.get(); float* topk_scores_1st_stage = topk_tmp_buffer; int32_t* topk_tokens_1st_stage = reinterpret_cast(topk_scores_1st_stage + candidate_count * max_parts_of_vocab); @@ -96,10 +96,10 @@ void BeamSearch_Cuda::SelectTop() { int32_t* topk_tokens_2nd_stage = reinterpret_cast(topk_scores_2nd_stage + candidate_count); cuda::BeamSearchTopK(next_token_scores_.data(), - params_.batch_size, - params_.search.num_beams, - params_.vocab_size, - 2 * params_.search.num_beams, + params_->batch_size, + params_->search.num_beams, + params_->vocab_size, + 2 * params_->search.num_beams, topk_scores_1st_stage, topk_tokens_1st_stage, topk_scores_2nd_stage, @@ -107,13 +107,13 @@ void BeamSearch_Cuda::SelectTop() { topk_next_scores_.get(), topk_next_tokens_.get(), topk_next_indices_.get(), - params_.cuda_stream); + params_->cuda_stream); } else assert(false); - CudaCheck() == cudaStreamSynchronize(params_.cuda_stream); + CudaCheck() == cudaStreamSynchronize(params_->cuda_stream); - size_t size = params_.BatchBeamSize() * 2; + size_t size = params_->BatchBeamSize() * 2; std::span next_scores{topk_next_scores_.get(), size}; std::span next_tokens{topk_next_tokens_.get(), size}; std::span next_indices{topk_next_indices_.get(), size}; @@ -131,52 +131,52 @@ void BeamSearch_Cuda::SelectTop() { } void GreedySearch_Cuda::SelectTop() { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, 1, 0.0, 1.0); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, 1, 0.0, 1.0); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::SampleTopP(float p, 
float temperature) { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, -1, p, temperature); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, -1, p, temperature); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::SampleTopK(int k, float temperature) { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, k, 0.0, temperature); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, k, 0.0, temperature); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::SampleTopKTopP(int k, float p, float temperature) { - std::span scores = next_token_scores_.subspan(0, params_.batch_size * params_.vocab_size); - cuda::GetSample(samplingdata_.get(), params_.cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_.batch_size), - params_.batch_size, k, p, temperature); + std::span scores = next_token_scores_.subspan(0, params_->batch_size * params_->vocab_size); + cuda::GetSample(samplingdata_.get(), params_->cuda_stream, next_tokens_.data(), scores.data(), int(scores.size() / params_->batch_size), + params_->batch_size, k, p, temperature); CheckForEOS(); AppendNextTokensToSequences(); } void GreedySearch_Cuda::CheckForEOS() { assert(next_tokens_.size() == eos_meet_.size()); - cuda::Launch_CheckForEOS(next_tokens_.data(), static_cast(next_tokens_.size()), eos_meet_.data(), params_.eos_token_id, params_.pad_token_id, done_cpu_.get(), params_.cuda_stream); + cuda::Launch_CheckForEOS(next_tokens_.data(), static_cast(next_tokens_.size()), eos_meet_.data(), params_->eos_token_id, params_->pad_token_id, done_cpu_.get(), params_->cuda_stream); } void GreedySearch_Cuda::AppendNextTokensToSequences() { sequences_.AppendNextTokenToSequences(next_tokens_); - if (sequences_.GetSequenceLength() == params_.search.max_length) + if (sequences_.GetSequenceLength() == params_->search.max_length) *done_cpu_ = true; } bool BeamSearch_Cuda::IsDone() const { beam_scorer_->IsDone(); - return beam_scorer_->IsDoneLater() || sequences_.GetSequenceLength() == params_.search.max_length; + return beam_scorer_->IsDoneLater() || sequences_.GetSequenceLength() == params_->search.max_length; } void BeamSearch_Cuda::AppendNextTokensToSequences() { @@ -195,10 +195,10 @@ void GreedySearch::Finalize(size_t num_return_sequences, std::span outp // Copy the sequences to output std::span output{ output_sequences_->GetTensorMutableData(), shape_count}; - for (int batch_id = 0; batch_id < params_.batch_size; ++batch_id) { + for (int batch_id = 0; batch_id < params_->batch_size; ++batch_id) { auto batch_output = output.subspan( - static_cast(batch_id) * params_.max_length, - params_.max_length); + static_cast(batch_id) * params_->max_length, + params_->max_length); std::span sequence_source = 
sequences_.GetSequence(batch_id); std::copy(sequence_source, batch_output); } @@ -206,8 +206,8 @@ void GreedySearch::Finalize(size_t num_return_sequences, std::span outp #endif std::span Search_Cuda::GetScores(int batch_beam_index) { - assert(batch_beam_index >= 0 && batch_beam_index < params_.BatchBeamSize()); - return next_token_scores_.subspan(batch_beam_index * params_.vocab_size, params_.vocab_size); + assert(batch_beam_index >= 0 && batch_beam_index < params_->BatchBeamSize()); + return next_token_scores_.subspan(batch_beam_index * params_->vocab_size, params_->vocab_size); } std::span Search_Cuda::GetScores() { @@ -218,7 +218,7 @@ void Search_Cuda::ApplyMinLength(int min_length) { if (sequences_.GetSequenceLength() >= min_length) return; - cuda::LaunchSetScoreProcessor(GetScores().data(), params_.BatchBeamSize(), params_.vocab_size, params_.eos_token_id, std::numeric_limits::lowest(), params_.cuda_stream); + cuda::LaunchSetScoreProcessor(GetScores().data(), params_->BatchBeamSize(), params_->vocab_size, params_->eos_token_id, std::numeric_limits::lowest(), params_->cuda_stream); } void Search_Cuda::ApplyRepetitionPenalty(float penalty) { @@ -226,8 +226,8 @@ void Search_Cuda::ApplyRepetitionPenalty(float penalty) { return; cuda::LaunchRepetitionPenaltyProcessor(sequences_.GetSequences().data(), - GetScores().data(), params_.batch_size, params_.search.num_beams, params_.vocab_size, - params_.search.max_length, GetSequenceLength(), penalty, params_.cuda_stream); + GetScores().data(), params_->batch_size, params_->search.num_beams, params_->vocab_size, + params_->search.max_length, GetSequenceLength(), penalty, params_->cuda_stream); } } // namespace Generators \ No newline at end of file diff --git a/src/search_cuda.h b/src/search_cuda.h index 8d1ddbbb4..11a5a428d 100644 --- a/src/search_cuda.h +++ b/src/search_cuda.h @@ -15,7 +15,7 @@ struct Search_Cuda : Search { RoamingArray GetSequence(int index) override { return sequences_.GetSequence(index); } bool IsDone() const { - cudaStreamSynchronize(params_.cuda_stream); + cudaStreamSynchronize(params_->cuda_stream); return *done_cpu_; } // TODO: Use an event void SetLogits(RoamingArray logits); diff --git a/test/model_tests.cpp b/test/model_tests.cpp index 90655cfed..a2b3a7832 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -34,13 +34,13 @@ TEST(ModelTests, GreedySearchGptFp32) { auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - Generators::GeneratorParams params{*model}; - params.search.max_length = 10; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.input_ids = input_ids; + auto params = Generators::CreateGeneratorParams(*model); + params->search.max_length = 10; + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->input_ids = input_ids; - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); @@ -48,10 +48,10 @@ TEST(ModelTests, GreedySearchGptFp32) { } // Verify outputs match expected outputs - for (size_t i = 0; i < static_cast(params.batch_size); i++) { + for (size_t i = 0; i < static_cast(params->batch_size); i++) { auto sequence = generator->GetSequence(i).GetCPU(); - auto* expected_output_start = &expected_output[i * params.search.max_length]; - EXPECT_TRUE(0 == 
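The `ApplyRepetitionPenalty` hunks (CPU earlier, CUDA above) only switch the parameter access to `params_->`; for context, a common formulation of the penalty itself is the CTRL-style rule below, shown as a sketch that may differ in detail from this project's CPU loop and CUDA kernel:

```cpp
#include <cstdint>
#include <span>

// CTRL-style repetition penalty: tokens already present in the sequence
// become less likely. Sketch only; the project's implementations may differ.
void ApplyRepetitionPenalty(std::span<float> scores,
                            std::span<const int32_t> sequence,
                            float penalty) {
  if (penalty == 1.0f) return;  // 1.0 means "no penalty"
  for (int32_t token : sequence) {
    float& score = scores[token];
    // Positive scores are divided, negative scores multiplied, so the
    // adjustment always pushes the token's score down for penalty > 1.
    score = score > 0 ? score / penalty : score * penalty;
  }
}
```

A `penalty` of exactly 1.0 is a no-op, which is why both the CPU and CUDA paths in the patch return early in that case.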
std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + auto* expected_output_start = &expected_output[i * params->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -74,16 +74,16 @@ TEST(ModelTests, BeamSearchGptFp32) { auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - Generators::GeneratorParams params{*model}; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.input_ids = input_ids; - params.search.max_length = 20; - params.search.length_penalty = 1.0f; - params.search.num_beams = 4; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->input_ids = input_ids; + params->search.max_length = 20; + params->search.length_penalty = 1.0f; + params->search.num_beams = 4; - Generators::BeamSearch_Cpu search{params}; - auto state = model->CreateState(search.sequence_lengths_, params); + Generators::BeamSearch_Cpu search{*params}; + auto state = model->CreateState(search.sequence_lengths_, *params); while (!search.IsDone()) { search.SetLogits(state->Run(search.GetSequenceLength(), search.GetNextTokens(), search.GetNextIndices())); @@ -95,14 +95,14 @@ TEST(ModelTests, BeamSearchGptFp32) { search.SelectTop(); } - std::vector output_sequence(static_cast(search.params_.batch_size) * search.params_.search.max_length); + std::vector output_sequence(static_cast(search.params_->batch_size) * search.params_->search.max_length); search.Finalize(1, Generators::cpu_span{output_sequence}, {}); // Verify outputs match expected outputs - for (size_t i = 0; i < static_cast(search.params_.batch_size); i++) { - auto sequence = std::span(output_sequence.data() + search.params_.search.max_length * i, search.params_.search.max_length); - auto* expected_output_start = &expected_output[i * search.params_.search.max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + for (size_t i = 0; i < static_cast(search.params_->batch_size); i++) { + auto sequence = std::span(output_sequence.data() + search.params_->search.max_length * i, search.params_->search.max_length); + auto* expected_output_start = &expected_output[i * search.params_->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -118,13 +118,13 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, const char* model_label) auto model = Generators::CreateModel(*g_ort_env, model_path); - Generators::GeneratorParams params{*model}; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.search.max_length = 10; - params.input_ids = input_ids; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->search.max_length = 10; + params->input_ids = input_ids; - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); @@ -132,11 +132,11 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, 
const char* model_label) } // Verify outputs match expected outputs - for (int i = 0; i < params.batch_size; i++) { + for (int i = 0; i < params->batch_size; i++) { auto sequence_gpu = generator->GetSequence(i); auto sequence = sequence_gpu.GetCPU(); - auto* expected_output_start = &expected_output[i * params.search.max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + auto* expected_output_start = &expected_output[i * params->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -163,34 +163,34 @@ void Test_BeamSearch_Gpt_Cuda(const char* model_path, const char* model_label) { // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) auto model = Generators::CreateModel(*g_ort_env, model_path); - Generators::GeneratorParams params{*model}; - params.batch_size = static_cast(input_ids_shape[0]); - params.sequence_length = static_cast(input_ids_shape[1]); - params.input_ids = input_ids; - params.search.max_length = 20; - params.search.num_beams = 4; - params.search.length_penalty = 1.0f; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = static_cast(input_ids_shape[0]); + params->sequence_length = static_cast(input_ids_shape[1]); + params->input_ids = input_ids; + params->search.max_length = 20; + params->search.num_beams = 4; + params->search.length_penalty = 1.0f; - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); generator->GenerateNextToken(); } - size_t sequence_length = params.batch_size * params.search.max_length; + size_t sequence_length = params->batch_size * params->search.max_length; auto output_sequence_cuda = Generators::CudaMallocArray(sequence_length); auto output_sequence_cpu = std::make_unique(sequence_length); generator->search_->Finalize(1, Generators::gpu_span(output_sequence_cuda.get(), sequence_length), {}); - cudaMemcpyAsync(output_sequence_cpu.get(), output_sequence_cuda.get(), sequence_length * sizeof(int32_t), cudaMemcpyDeviceToHost, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); + cudaMemcpyAsync(output_sequence_cpu.get(), output_sequence_cuda.get(), sequence_length * sizeof(int32_t), cudaMemcpyDeviceToHost, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs - for (int i = 0; i < params.batch_size; i++) { - auto sequence = std::span(output_sequence_cpu.get() + params.search.max_length * i, params.search.max_length); - auto* expected_output_start = &expected_output[i * params.search.max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params.search.max_length * sizeof(int32_t))); + for (int i = 0; i < params->batch_size; i++) { + auto sequence = std::span(output_sequence_cpu.get() + params->search.max_length * i, params->search.max_length); + auto* expected_output_start = &expected_output[i * params->search.max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), params->search.max_length * sizeof(int32_t))); } } @@ -216,14 +216,14 @@ Print all primes between 1 and n auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); - Generators::GeneratorParams params{*model}; - params.batch_size = 1; - params.sequence_length = 
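The test updates above all converge on the same usage pattern: create the params on the heap through the factory, configure them through `->`, and drive the generator loop. A condensed sketch of that pattern; the include paths and input values are placeholders for this sketch, not taken from the tests:

```cpp
#include "generators.h"     // include paths assumed for this sketch
#include "models/model.h"
#include <vector>

void RunGreedyOnce(Generators::Model& model) {
  std::vector<int32_t> input_ids{0, 1, 2, 3};  // placeholder prompt tokens

  auto params = Generators::CreateGeneratorParams(model);  // shared_ptr-owned params
  params->batch_size = 1;
  params->sequence_length = static_cast<int>(input_ids.size());
  params->input_ids = input_ids;
  params->search.max_length = 10;

  auto generator = Generators::CreateGenerator(model, *params);
  while (!generator->IsDone()) {
    generator->ComputeLogits();
    generator->GenerateNextToken();
  }
  auto sequence = generator->GetSequence(0).GetCPU();  // final tokens for batch entry 0
}
```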
static_cast(tokens.size()); - params.input_ids = tokens; - params.search.max_length = 128; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = 1; + params->sequence_length = static_cast(tokens.size()); + params->input_ids = tokens; + params->search.max_length = 128; // Generator version - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); generator->GenerateNextToken_Top(); @@ -254,14 +254,14 @@ Print all primes between 1 and n auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); - Generators::GeneratorParams params{*model}; - params.batch_size = 1; - params.sequence_length = static_cast(tokens.size()); - params.input_ids = tokens; - params.search.max_length = 128; + auto params = Generators::CreateGeneratorParams(*model); + params->batch_size = 1; + params->sequence_length = static_cast(tokens.size()); + params->input_ids = tokens; + params->search.max_length = 128; // High level version - auto result = Generators::Generate(*model, params); + auto result = Generators::Generate(*model, *params); std::cout << tokenizer->Decode(result[0]) << "\r\n"; #else diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index 3f21ed669..6190e2507 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -24,24 +24,24 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCpu) { int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - std::unique_ptr logits_cpu(new float[vocab_size * batch_size]); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); double total_time = 0.0; int num_iter = 1000; for (int i = 0; i < num_iter; i++) { - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); int num_large = dist(engine); - CreateRandomLogits(logits_cpu.get(), num_large, vocab_size, batch_size, engine); - generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); + CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); + generator->search_->SetLogits(Generators::cpu_span(logits_cpu.data(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopP(0.95f, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -59,14 +59,14 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { int batch_size = 1; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = 
Generators::DeviceType::CPU; - std::unique_ptr logits_cpu(new float[vocab_size * batch_size]); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(5, 25); @@ -74,9 +74,9 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - CreateRandomLogits(logits_cpu.get(), num_large, vocab_size, batch_size, engine); - generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); + auto generator = Generators::CreateGenerator(*model, *params); + CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); + generator->search_->SetLogits(Generators::cpu_span(logits_cpu.data(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopK(k, 1.0f); @@ -97,14 +97,14 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - std::unique_ptr logits_cpu(new float[vocab_size * batch_size]); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(5, 25); @@ -112,9 +112,9 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - CreateRandomLogits(logits_cpu.get(), num_large, vocab_size, batch_size, engine); - generator->search_->SetLogits(Generators::cpu_span(logits_cpu.get(), vocab_size * batch_size)); + auto generator = Generators::CreateGenerator(*model, *params); + CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); + generator->search_->SetLogits(Generators::cpu_span(logits_cpu.data(), vocab_size * batch_size)); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopKTopP(k, p, 1.0f); @@ -136,14 +136,14 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - float* cpu_logits = new float[vocab_size * batch_size]; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length 
= 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + std::vector cpu_logits(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); @@ -153,13 +153,13 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpy(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopP(0.95f, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -167,7 +167,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { total_time += duration.count(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); } double average_time = total_time / double(num_iter); std::cout << "Average time taken by TopP CUDA: " @@ -183,16 +183,16 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { int batch_size = 1; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocArray(vocab_size * batch_size); - float* cpu_logits = new float[vocab_size * batch_size]; + std::vector cpu_logits(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); @@ -200,12 +200,12 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpy(cpu_logits, 
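The benchmark hunks above and below replace `new float[...]` buffers (which were never freed) with `std::vector<float>`. The change is mechanical but worth spelling out, since `.data()` still provides the contiguous pointer that the CUDA copies expect:

```cpp
#include <vector>

void FillLogits(int vocab_size, int batch_size) {
  // Before: float* cpu_logits = new float[vocab_size * batch_size];  // leaked each run
  // After: the container owns the memory and releases it automatically.
  std::vector<float> cpu_logits(static_cast<size_t>(vocab_size) * batch_size, 0.0f);
  // ... use cpu_logits.data() exactly where the raw pointer was used ...
}  // memory released here
```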
logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopK(k, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -227,30 +227,30 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocArray(vocab_size * batch_size); - float* cpu_logits = new float[vocab_size * batch_size]; + std::vector cpu_logits(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); double total_time = 0.0; int num_iter = 1000; for (int i = 0; i < num_iter; i++) { - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); int num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpy(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SampleTopKTopP(k, p, 1.0f); auto stop = std::chrono::high_resolution_clock::now(); @@ -258,7 +258,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { total_time += duration.count(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); } double average_time = total_time / double(num_iter); std::cout << "Average time taken by TopP+K: " @@ -273,13 +273,13 @@ TEST(Benchmarks, 
BenchmarkRandomizedSelectTopCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 12; std::vector input_ids{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; // Needs to match batch_size - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocArray(vocab_size * batch_size); std::vector cpu_logits(vocab_size * batch_size); @@ -290,12 +290,12 @@ TEST(Benchmarks, BenchmarkRandomizedSelectTopCuda) { int num_iter = 1000; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); cudaMemcpy(cpu_logits.data(), logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); auto start = std::chrono::high_resolution_clock::now(); generator->search_->SelectTop(); auto stop = std::chrono::high_resolution_clock::now(); diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index f42c03e0a..531270f78 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -25,14 +25,14 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { 0.1f, 0.1f, 0.1f, 0.1f, 0.6f}; int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + auto generator = Generators::CreateGenerator(*model, *params); auto logits_span = Generators::cpu_span(logits_cpu); generator->search_->SetLogits(logits_span); generator->computed_logits_ = true; @@ -51,14 +51,14 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { 1.25f, 0.25f, 1.5f, 0.25f, 2.0f}; int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 
1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + auto generator = Generators::CreateGenerator(*model, *params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; @@ -83,14 +83,14 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { 1.25f, 0.25f, 1.5f, 0.25f, 2.0f}; int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; + auto generator = Generators::CreateGenerator(*model, *params); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; @@ -128,20 +128,20 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); std::uniform_int_distribution<> dist(1, 25); int num_iter = 100; for (int i = 0; i < num_iter; i++) { - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); int num_large = dist(engine); CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; @@ -164,13 +164,13 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { int batch_size = 5; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; std::vector 
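The randomized sampling tests above generate logits with a handful of deliberately large entries and then check that the sampled token is one of them; the verification bodies are elided in the diff. An illustrative helper for that kind of check (a sketch, not the tests' actual code):

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Returns true if `token` is one of the k largest entries in `logits`.
// Assumes 0 < k <= logits.size().
bool IsInTopK(const std::vector<float>& logits, int32_t token, int k) {
  std::vector<int32_t> indices(logits.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::partial_sort(indices.begin(), indices.begin() + k, indices.end(),
                    [&](int32_t a, int32_t b) { return logits[a] > logits[b]; });
  return std::find(indices.begin(), indices.begin() + k, token) != indices.begin() + k;
}
```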
logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); @@ -178,7 +178,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy=logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); @@ -201,13 +201,13 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CPU; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CPU; std::vector logits_cpu(vocab_size * batch_size); std::random_device rd; std::mt19937 engine(rd()); @@ -215,7 +215,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); + auto generator = Generators::CreateGenerator(*model, *params); CreateRandomLogits(logits_cpu.data(), num_large, vocab_size, batch_size, engine); auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); @@ -246,16 +246,16 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { auto logits_gpu = Generators::CudaMallocArray(logits_cpu.size()); int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs @@ -274,16 +274,16 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { auto logits_gpu = Generators::CudaMallocArray(logits_cpu.size()); int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - 
params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs @@ -307,16 +307,16 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { auto logits_gpu = Generators::CudaMallocArray(logits_cpu.size()); int vocab_size = 5; int batch_size = 4; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; - cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params.cuda_stream); - cudaStreamSynchronize(params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; + cudaMemcpyAsync(logits_gpu.get(), logits_cpu.data(), logits_cpu.size() * sizeof(float), cudaMemcpyHostToDevice, params->cuda_stream); + cudaStreamSynchronize(params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs @@ -336,13 +336,13 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -352,15 +352,15 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - 
auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_TopP(0.95f, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -376,13 +376,13 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { int batch_size = 5; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -392,15 +392,15 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_TopK(k, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs 
match expected outputs for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -417,13 +417,13 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { float p = 0.95f; int k = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -433,15 +433,15 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int num_large = dist(engine); - auto generator = Generators::CreateGenerator(*model, params); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -456,13 +456,13 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; - Generators::GeneratorParams params = Generators::GeneratorParams{}; - params.search.max_length = 10; - params.batch_size = batch_size; - params.sequence_length = 1; - params.vocab_size = vocab_size; - params.input_ids = input_ids; - params.device_type = Generators::DeviceType::CUDA; + auto params = Generators::CreateGeneratorParams(); + params->search.max_length = 10; + params->batch_size = batch_size; + params->sequence_length = 1; + params->vocab_size = vocab_size; + params->input_ids = input_ids; + params->device_type = Generators::DeviceType::CUDA; auto logits_gpu = Generators::CudaMallocArray(vocab_size * batch_size); auto indices_buffer = Generators::CudaMallocHostArray(vocab_size * batch_size); float* cpu_logits = new float[vocab_size * batch_size]; @@ -472,15 +472,15 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { int num_iter = 100; for (int i = 0; i < num_iter; i++) { int 
num_large = dist(engine); - LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params.cuda_stream); - LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params.cuda_stream); - cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params.cuda_stream); - auto generator = Generators::CreateGenerator(*model, params); + LaunchGeometricDecayKernel(logits_gpu.get(), vocab_size, batch_size, num_large, 20.0f, params->cuda_stream); + LaunchFisherYatesKernel(logits_gpu.get(), indices_buffer.get(), vocab_size, batch_size, params->cuda_stream); + cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); + auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; generator->GenerateNextToken_Top(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); - cudaStreamSynchronize(params.cuda_stream); + cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { float max_score = *std::max_element(cpu_logits + vocab_size * b, cpu_logits + vocab_size * (b + 1)); From e46c8b21665c7197e975e52edba8b67b920bf1bf Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Thu, 21 Mar 2024 13:30:10 -0700 Subject: [PATCH 16/36] Fix nightly pipeline (#222) --- .github/workflows/linux-cpu-x64-nightly-build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml index 89a52a803..c7a0234b1 100644 --- a/.github/workflows/linux-cpu-x64-nightly-build.yml +++ b/.github/workflows/linux-cpu-x64-nightly-build.yml @@ -45,12 +45,12 @@ jobs: run: | set -e -x rm -rf build - cmake --preset linux_clang_cpu_release - cmake --build --preset linux_clang_cpu_release + cmake --preset linux_gcc_cpu_release + cmake --build --preset linux_gcc_cpu_release - name: Install the python wheel and test dependencies run: | - python3 -m pip install build/clang_cpu/release/wheel/onnxruntime_genai*.whl + python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl python3 -m pip install -r test/python/requirements-nightly-cpu.txt --user - name: Get HuggingFace Token From a29d4a5d71b760ea2a81cf141892e95b3f917afc Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Thu, 21 Mar 2024 13:40:39 -0700 Subject: [PATCH 17/36] Add repeat KV to model builder (#210) ### Description This PR adds `repeat_kv` to the model builder for models where `num_attention_heads != num_key_value_heads`. ### Motivation and Context By supporting `repeat_kv`, models where `num_attention_heads != num_key_value_heads` can now run on both CPU and GPU. 
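For illustration, the interleave-and-repeat that the exported subgraph performs is equivalent to the following NumPy sketch (the `repeat_kv_reference` helper and its names are hypothetical and only describe the intended tensor transformation; they are not code added by this PR):

```python
import numpy as np

def repeat_kv_reference(kv, num_attn_heads):
    # kv has shape (batch_size, num_kv_heads, sequence_length, head_size)
    batch_size, num_kv_heads, seq_len, head_size = kv.shape
    n_rep = num_attn_heads // num_kv_heads
    # Broadcast each KV head n_rep times in an interleaved pattern
    # (the Unsqueeze + Expand step of the subgraph)
    expanded = np.broadcast_to(
        kv[:, :, None, :, :],
        (batch_size, num_kv_heads, n_rep, seq_len, head_size),
    )
    # Fold the repeats back into the head dimension (the final Reshape step)
    return expanded.reshape(batch_size, num_kv_heads * n_rep, seq_len, head_size)
```

In the exported graph the same effect is expressed with Reshape/Transpose/Unsqueeze/Expand nodes, alongside the Concat that updates the KV cache.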
--- src/python/py/models/builder.py | 298 ++++++++++++++++++++++++-------- 1 file changed, 230 insertions(+), 68 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1652d5778..fa022f7aa 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -116,7 +116,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "op_type": "MultiHeadAttention", # Attention op to use "use_gqa": ep == "cuda" and io_dtype == TensorProto.FLOAT16 # Check if GroupQueryAttention can be used } - if self.attention_attrs["use_gqa"] or self.num_attn_heads != self.num_kv_heads: + if self.attention_attrs["use_gqa"]: self.attention_attrs["op_type"] = "GroupQueryAttention" # Quantization-specific variables (INT4, INT8, etc.) @@ -166,15 +166,15 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): "do_sample": config.do_sample if hasattr(config, "do_sample") else False, "early_stopping": True, "length_penalty": config.length_penalty if hasattr(config, "length_penalty") else 1.0, - "max_length": config.max_length if hasattr(config, "max_length") else 20, + "max_length": self.context_length, "min_length": 0, "no_repeat_ngram_size": config.no_repeat_ngram_size if hasattr(config, "no_repeat_ngram_size") else 0, "num_beams": config.num_beams if hasattr(config, "num_beams") else 1, "num_return_sequences": config.num_return_sequences if hasattr(config, "num_return_sequences") else 1, - "past_present_share_buffer": True if self.attention_attrs["op_type"] == "GroupQueryAttention" else False, + "past_present_share_buffer": self.attention_attrs["op_type"] == "GroupQueryAttention", "repetition_penalty": config.repetition_penalty if hasattr(config, "repetition_penalty") else 1.0, "temperature": config.temperature if hasattr(config, "temperature") else 1.0, - "top_k": config.top_k if hasattr(config, "top_k") else 50, + "top_k": 1, "top_p": config.top_p if hasattr(config, "top_p") else 1.0, }, } @@ -337,7 +337,7 @@ def make_constant(self, name): path = name.split("/") onnx_dtype, dims, num = eval(path[-3]), path[-2], eval(path[-1]) np_dtype = self.to_numpy_dtype[onnx_dtype] - value = numpy_helper.from_array(np.array(num if dims == "0D" else [num], dtype=np_dtype), name=name.replace("constants", "numpy_helper")) + value = numpy_helper.from_array(np.array(num if dims == "0D" else list(num) if type(num) == tuple else [num], dtype=np_dtype), name=name.replace("constants", "numpy_helper")) node_name = name.replace("constants", "constant_nodes") self.make_node("Constant", inputs=[], outputs=[name], name=node_name, value=value) @@ -349,10 +349,10 @@ def make_gather(self, name, inputs, axis): self.make_node("Gather", inputs=inputs, outputs=[output], name=name, axis=axis) self.make_value_info(output, TensorProto.INT64, shape=[]) - def make_reshape(self, name, inputs): + def make_reshape(self, name, inputs, dtype, shape): output = f"{name}/output_0" self.make_node("Reshape", inputs=inputs, outputs=[output], name=name) - self.make_value_info(output, TensorProto.INT64, shape=None) + self.make_value_info(output, dtype, shape=shape) def make_shape(self, name, root_input, shape): output = f"{name}/output_0" @@ -379,10 +379,10 @@ def make_concat(self, name, inputs, dtype, shape, axis=0): self.make_node("Concat", inputs=inputs, outputs=[output], name=name, axis=axis) self.make_value_info(output, dtype, shape=shape) - def make_equal(self, name, inputs): + def make_equal(self, name, inputs, shape): output = f"{name}/output_0" 
self.make_node("Equal", inputs=inputs, outputs=[output], name=name) - self.make_value_info(output, TensorProto.BOOL, shape=[4]) + self.make_value_info(output, TensorProto.BOOL, shape=shape) def make_where(self, name, inputs, dtype, shape): output = f"{name}/output_0" @@ -439,6 +439,11 @@ def make_mul(self, name, inputs, dtype, shape): self.make_node("Mul", inputs=inputs, outputs=[output], name=name) self.make_value_info(output, dtype, shape=shape) + def make_transpose(self, name, root_input, dtype, shape, perm): + output = f"{name}/output_0" + self.make_node("Transpose", inputs=[root_input], outputs=[output], perm=perm) + self.make_value_info(output, dtype, shape=shape) + def make_matmul(self, matmul, name, root_input, **kwargs): self.make_matmul_fp16_or_fp32(matmul, name, root_input, **kwargs) @@ -512,7 +517,6 @@ def make_embedding(self, embedding): self.layernorm_attrs["root_input"] = layernorm_attrs_value self.layernorm_attrs["skip_input"] = layernorm_attrs_value - def make_layernorm(self, layer_id, layernorm, skip, simple, location): root_input = self.layernorm_attrs["root_input"] skip_input = self.layernorm_attrs["skip_input"] @@ -552,7 +556,7 @@ def make_layernorm(self, layer_id, layernorm, skip, simple, location): return output_0 - def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): + def make_rotary_embedding_caches(self, rotemb): cos_cache_name, sin_cache_name = "cos_cache", "sin_cache" if self.rotemb_attrs["create_rotary_embedding_caches"]: @@ -576,11 +580,195 @@ def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): self.rotemb_attrs["create_rotary_embedding_caches"] = False + return cos_cache_name, sin_cache_name + + def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): + cos_cache_name, sin_cache_name = self.make_rotary_embedding_caches(rotemb) + inputs = [root_input, kwargs.pop("position_ids"), cos_cache_name, sin_cache_name] output = f"{name}/output_0" self.make_node("RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", interleaved=0, **kwargs) self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * (self.num_kv_heads if "k_rotary" in name else self.num_attn_heads)]) + # TODO: This function and any corresponding changes to support it are temporary until ORT supports GQA for CPU + def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): + # Make subgraph that repeats tensor of shape (batch_size, sequence_length, num_kv_heads, head_size) + # to shape (batch_size, sequence_length, num_attn_heads, head_size) in an interleaved pattern + # and updates the KV caches + # + # root_input + # | + # Reshape + # | + # Transpose + # | + # | past_kv + # | / + # Concat + # | \ + # | present_kv + # | + # +-------+---------+ + # | | + # | Shape + # | | + # | +-----------+-----------+-----------+ + # | | | | | + # | Gather Gather Gather Gather + # | (idx=0) (idx=1) (idx=2) (idx=3) + # | | | | | + # | Unsqueeze Unsqueeze Unsqueeze Unsqueeze + # | | | | | + # | +-----------+-----------+-----------+ + # | | + # | +-----------------------+ + # | | | + # | | Mul + # | | | + # | Concat Concat + # | (5D) (4D) + # | | | + # | Reshape | + # | / | \ | + # | / | \ | + # | / | \ / + # | / | \ / + # | / | \ / + # | / Shape \ / + # | / | \ / + # | | ConstantOfShape \ / + # | \ | \ \ / + # | \ | Mul | / + # | \ | | / / + # | \ | Equal / + # | \ | / / + # \ \ | / / + # \ \ | / / + # \ \ | / / + # \ \ | / / + # Unsqueeze Where / + # \ / / + # \ / / + # \ / / + # \ / 
/ + # Expand / + # | / + # | / + # | / + # | / + # | / + # Reshape + # | + # Transpose + # | + # Reshape + basename = f"/model/layers.{layer_id}/attn/{'k_proj' if past_kv.endswith('key') else 'v_proj'}/repeat_kv" + + # Make the initial subgraph + # + # +------> Gather --> Unsqueeze -----+ + # | | + # past_kv +------> Gather --> Unsqueeze -----+---> Mul --> Concat (4D) + # | | | + # root_input --> Reshape --> Transpose --> Concat --> Shape ---> Gather --> Unsqueeze -----+---> Concat (5D) + # | | | + # present_kv +------> Gather --> Unsqueeze -----+ + reshape_1_name = f"{basename}/Reshape_1" + reshape_1_inputs = [root_input, f"/model/constants/TensorProto.INT64/1D/0, 0, {self.num_kv_heads}, -1"] + self.make_reshape(reshape_1_name, reshape_1_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_kv_heads, self.head_size]) + transpose_1_name = f"{basename}/Transpose_1" + transpose_1_input = f"{reshape_1_name}/output_0" + self.make_transpose(transpose_1_name, transpose_1_input, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 'sequence_length', self.head_size], perm=[0,2,1,3]) + concat_1_name = f"{basename}/Concat_1" + concat_1_inputs = [past_kv, f"{transpose_1_name}/output_0"] + self.make_node("Concat", inputs=concat_1_inputs, outputs=[present_kv], name=concat_1_name, axis=2) + + shape_1_name = f"{basename}/Shape_1" + self.make_shape(shape_1_name, present_kv, shape=[4]) + gather_1_name = f"{basename}/Gather_1" + gather_1_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/0"] + self.make_gather(gather_1_name, gather_1_inputs, axis=0) + unsqueeze_1_name = f"{basename}/Unsqueeze_1" + unsqueeze_1_inputs = [f"{gather_1_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_1_name, unsqueeze_1_inputs, dtype=TensorProto.INT64, shape=[1]) + gather_2_name = f"{basename}/Gather_2" + gather_2_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/1"] + self.make_gather(gather_2_name, gather_2_inputs, axis=0) + unsqueeze_2_name = f"{basename}/Unsqueeze_2" + unsqueeze_2_inputs = [f"{gather_2_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_2_name, unsqueeze_2_inputs, dtype=TensorProto.INT64, shape=[1]) + gather_3_name = f"{basename}/Gather_3" + gather_3_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/2"] + self.make_gather(gather_3_name, gather_3_inputs, axis=0) + unsqueeze_3_name = f"{basename}/Unsqueeze_3" + unsqueeze_3_inputs = [f"{gather_3_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_3_name, unsqueeze_3_inputs, dtype=TensorProto.INT64, shape=[1]) + gather_4_name = f"{basename}/Gather_4" + gather_4_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/3"] + self.make_gather(gather_4_name, gather_4_inputs, axis=0) + unsqueeze_4_name = f"{basename}/Unsqueeze_4" + unsqueeze_4_inputs = [f"{gather_4_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"] + self.make_unsqueeze(unsqueeze_4_name, unsqueeze_4_inputs, dtype=TensorProto.INT64, shape=[1]) + concat_2_name = f"{basename}/Concat_2" + concat_2_inputs = [f"{unsqueeze_1_name}/output_0", f"{unsqueeze_2_name}/output_0", f"/model/constants/TensorProto.INT64/1D/{self.num_attn_heads // self.num_kv_heads}", f"{unsqueeze_3_name}/output_0", f"{unsqueeze_4_name}/output_0"] + self.make_concat(concat_2_name, concat_2_inputs, dtype=TensorProto.INT64, shape=[5], axis=0) + + mul_1_name = f"{basename}/Mul_1" 
+ mul_1_inputs = [f"{unsqueeze_2_name}/output_0", f"/model/constants/TensorProto.INT64/0D/{self.num_attn_heads // self.num_kv_heads}"] + self.make_mul(mul_1_name, mul_1_inputs, dtype=TensorProto.INT64, shape=None) + concat_3_name = f"{basename}/Concat_3" + concat_3_inputs = [f"{unsqueeze_1_name}/output_0", f"{mul_1_name}/output_0", f"{unsqueeze_3_name}/output_0", f"{unsqueeze_4_name}/output_0"] + self.make_concat(concat_3_name, concat_3_inputs, dtype=TensorProto.INT64, shape=[4], axis=0) + + # Make the subgraph that follows the initial subgraph + # + # Mul ---> Equal + # / \ + # Reshape --> Shape --> ConstantOfShape --> Where + # | | + # +----------------------------------------+ + reshape_2_name = f"{basename}/Reshape_2" + reshape_2_inputs = [f"{concat_2_name}/output_0", "/model/constants/TensorProto.INT64/1D/-1"] + self.make_reshape(reshape_2_name, reshape_2_inputs, dtype=TensorProto.INT64, shape=None) + shape_2_name = f"{basename}/Shape_2" + self.make_shape(shape_2_name, f"{reshape_2_name}/output_0", shape=[1]) + constant_shape_name = f"{basename}/ConstantOfShape" + constant_shape_value = numpy_helper.from_array(np.array([1], dtype="int64")) + self.make_constant_of_shape(constant_shape_name, f"{shape_2_name}/output_0", value=constant_shape_value, dtype=TensorProto.INT64, shape=[5]) + mul_2_name = f"{basename}/Mul" + mul_2_inputs = [f"{constant_shape_name}/output_0", "/model/constants/TensorProto.INT64/0D/-1"] + self.make_mul(mul_2_name, mul_2_inputs, dtype=TensorProto.INT64, shape=[5]) + equal_name = f"{basename}/Equal" + equal_inputs = [f"{reshape_2_name}/output_0", f"{mul_2_name}/output_0"] + self.make_equal(equal_name, equal_inputs, shape=[5]) + where_name = f"{basename}/Where" + where_inputs = [f"{equal_name}/output_0", f"{constant_shape_name}/output_0", f"{reshape_2_name}/output_0"] + self.make_where(where_name, where_inputs, dtype=TensorProto.INT64, shape=[5]) + + # Make the final nodes + # + # Where (from above) Concat (from above) + # \ \ + # Unsqueeze --> Expand --> Reshape --> Transpose --> Reshape + unsqueeze_5_name = f"{basename}/Unsqueeze_5" + unsqueeze_5_inputs = [present_kv, "/model/constants/TensorProto.INT64/1D/2"] + self.make_unsqueeze(unsqueeze_5_name, unsqueeze_5_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 1, 'sequence_length', self.head_size]) + expand_name = f"{basename}/Expand" + expand_inputs = [f"{unsqueeze_5_name}/output_0", f"{where_name}/output_0"] + self.make_expand(expand_name, expand_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, self.num_attn_heads // self.num_kv_heads, 'sequence_length', self.head_size]) + reshape_3_name = f"{basename}/Reshape_3" + reshape_3_inputs = [f"{expand_name}/output_0", f"{concat_3_name}/output_0"] + self.make_reshape(reshape_3_name, reshape_3_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_attn_heads, 'sequence_length', self.head_size]) + transpose_2_name = f"{basename}/Transpose_2" + transpose_2_input = f"{reshape_3_name}/output_0" + self.make_transpose(transpose_2_name, transpose_2_input, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads, self.head_size], perm=[0,2,1,3]) + reshape_4_name = f"{basename}/Reshape_4" + reshape_4_inputs = [f"{transpose_2_name}/output_0", f"/model/constants/TensorProto.INT64/1D/0, 0, {self.num_attn_heads * self.head_size}"] + self.make_reshape(reshape_4_name, reshape_4_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.num_attn_heads * self.head_size]) + + input_to_attention = 
f"{reshape_4_name}/output_0" + return input_to_attention + def make_attention_op(self, name, **kwargs): op_type = self.attention_attrs["op_type"] @@ -648,13 +836,20 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): # | # O_Add + q_input_to_attention = "" + k_input_to_attention = "" + v_input_to_attention = "" + # Make MatMul nodes q_matmul_name = f"/model/layers.{layer_id}/attn/q_proj/MatMul" self.make_matmul(attention.q_proj.weight.detach().numpy(), q_matmul_name, root_input) + q_input_to_attention = f"{q_matmul_name}/output_0" k_matmul_name = f"/model/layers.{layer_id}/attn/k_proj/MatMul" self.make_matmul(attention.k_proj.weight.detach().numpy(), k_matmul_name, root_input) + k_input_to_attention = f"{k_matmul_name}/output_0" v_matmul_name = f"/model/layers.{layer_id}/attn/v_proj/MatMul" self.make_matmul(attention.v_proj.weight.detach().numpy(), v_matmul_name, root_input) + v_input_to_attention = f"{v_matmul_name}/output_0" # Make Add nodes (if bias exists) q_bias_exists = attention.q_proj.bias is not None @@ -664,27 +859,42 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): if q_bias_exists: q_add_name = f"/model/layers.{layer_id}/attn/q_proj/Add" self.make_add_bias(attention.q_proj.bias.detach().numpy(), q_add_name, root_input=f"{q_matmul_name}/output_0") + q_input_to_attention = f"{q_add_name}/output_0" if k_bias_exists: k_add_name = f"/model/layers.{layer_id}/attn/k_proj/Add" self.make_add_bias(attention.k_proj.bias.detach().numpy(), k_add_name, root_input=f"{k_matmul_name}/output_0") + k_input_to_attention = f"{k_add_name}/output_0" if v_bias_exists: v_add_name = f"/model/layers.{layer_id}/attn/v_proj/Add" self.make_add_bias(attention.v_proj.bias.detach().numpy(), v_add_name, root_input=f"{v_matmul_name}/output_0") + v_input_to_attention = f"{v_add_name}/output_0" # Make RotaryEmbedding nodes q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" q_rotary_input = f"{q_matmul_name if not q_bias_exists else q_add_name}/output_0" self.make_rotary_embedding(attention.rotary_emb, q_rotary_name, q_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) + q_input_to_attention = f"{q_rotary_name}/output_0" + k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" k_rotary_input = f"{k_matmul_name if not k_bias_exists else k_add_name}/output_0" self.make_rotary_embedding(attention.rotary_emb, k_rotary_name, k_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) + k_input_to_attention = f"{k_rotary_name}/output_0" + + # Make repeat KV nodes (TODO: remove once ORT supports GQA for CPU) + past_k = f"past_key_values.{layer_id}.key" + past_v = f"past_key_values.{layer_id}.value" + present_k = f"present.{layer_id}.key" + present_v = f"present.{layer_id}.value" + if self.num_attn_heads != self.num_kv_heads and not self.attention_attrs['use_gqa']: + k_input_to_attention = self.make_repeat_kv(layer_id, k_input_to_attention, past_k, present_k) + v_input_to_attention = self.make_repeat_kv(layer_id, v_input_to_attention, past_v, present_v) + past_k, past_v, present_k, present_v = "", "", "", "" # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) 
attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" self.make_attention_op( - attn_name, q_path=f"{q_rotary_name}/output_0", k_path=f"{k_rotary_name}/output_0", v_path=f"{v_matmul_name if not v_bias_exists else v_add_name}/output_0", - past_k=f"past_key_values.{layer_id}.key", past_v=f"past_key_values.{layer_id}.value", - present_k=f"present.{layer_id}.key", present_v=f"present.{layer_id}.value", **kwargs, + attn_name, q_path=q_input_to_attention, k_path=k_input_to_attention, v_path=v_input_to_attention, + past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, **kwargs, ) # Make MatMul node (output projection weight node) @@ -938,8 +1148,8 @@ def make_attention_mask_reformatting(self): # TODO: replace Concat with Expand for performance gains concat_name = f"{basename}/Concat" - concat_inputs = [f"{end_add_name}/output_0" for _ in range(self.num_kv_heads)] - concat_shape = ["batch_size", self.num_kv_heads, "source_sequence_length", "target_sequence_length"] + concat_inputs = [f"{end_add_name}/output_0" for _ in range(self.num_attn_heads)] + concat_shape = ["batch_size", self.num_attn_heads, "source_sequence_length", "target_sequence_length"] self.make_concat(concat_name, concat_inputs, dtype=self.io_dtype, shape=concat_shape, axis=1) # Shape of mask is now (B, N, S, T) self.mask_attrs["mask_name"] = concat_name @@ -1027,7 +1237,7 @@ def make_input_ids_subgraph(self, basename, past_key_gather_name): # Merged path reshape_name = f"{basename}/Reshape" reshape_inputs = [f"{add_2_name}/output_0", f"{concat_3_name}/output_0"] - self.make_reshape(reshape_name, reshape_inputs) + self.make_reshape(reshape_name, reshape_inputs, dtype=TensorProto.INT64, shape=None) less_name = f"{basename}/Less" less_inputs = [f"{range_name}/output_0", f"{reshape_name}/output_0"] self.make_less(less_name, less_inputs) @@ -1147,7 +1357,7 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for self.make_mul(mul_name, mul_inputs, dtype=TensorProto.INT64, shape=["unk"]) equal_name = f"{basename}/Equal" equal_inputs = [f"{concat_name}/output_0", f"{mul_name}/output_0"] - self.make_equal(equal_name, equal_inputs) + self.make_equal(equal_name, equal_inputs, shape=[4]) where_name = f"{basename}/Where_1" where_inputs = [f"{equal_name}/output_0", f"{constant_shape_name}/output_0", f"{concat_name}/output_0"] @@ -1159,7 +1369,7 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for self.make_expand(expand_name, expand_inputs, dtype=expand_dtype, shape=expand_shape) return expand_name - + class LlamaModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): @@ -1247,7 +1457,7 @@ def make_position_ids_reformatting(self): self.make_concat(concat_name, concat_inputs, dtype=TensorProto.INT64, shape=[2], axis=0) reshape_name = f"{basename}/Reshape" reshape_inputs = ["position_ids", f"{concat_name}/output_0"] - self.make_reshape(reshape_name, reshape_inputs) + self.make_reshape(reshape_name, reshape_inputs, dtype=TensorProto.INT64, shape=None) return reshape_name @@ -1265,54 +1475,6 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) - - def make_group_query_attention(self, name, **kwargs): - if self.layer_id < self.num_layers - 3: - 
super().make_group_query_attention(name, **kwargs) - return - - # Cast inputs and outputs of GroupQueryAttention - input_kwargs = {"q_path", "k_path", "v_path", "past_k", "past_v"} - new_kwargs = {} - - # Make input cast nodes to bfloat16 - for input_name in input_kwargs: - cast_name = f"/model/layers.{self.layer_id}/attn/{input_name.replace('path', 'proj')}/Cast" - cast_shape = ['batch_size', 'sequence_length', self.hidden_size] if input_name in {"q_path", "k_path", "v_path"} else ["batch_size", self.num_kv_heads, "past_sequence_length", self.head_size] - self.make_cast(cast_name, kwargs[input_name], dtype=TensorProto.BFLOAT16, shape=cast_shape) - new_kwargs[input_name] = f"{cast_name}/output_0" - - # Make GroupQueryAttention node - inputs = [ - new_kwargs["q_path"], new_kwargs["k_path"], new_kwargs["v_path"], - new_kwargs["past_k"], new_kwargs["past_v"], - kwargs.get("seqlens_k", ""), kwargs.get("total_seq_len", ""), - ] - outputs = [f"{name}/Cast/output_0", f"{name}/output_1", f"{name}/output_2"] - self.make_node("GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads) - self.make_value_info(outputs[0], TensorProto.BFLOAT16, shape=['batch_size', 'sequence_length', self.hidden_size]) - - present_kv_shape = ["batch_size", self.num_kv_heads, "total_sequence_length", self.head_size] - self.make_value_info(outputs[1], TensorProto.BFLOAT16, shape=present_kv_shape) - self.make_value_info(outputs[2], TensorProto.BFLOAT16, shape=present_kv_shape) - - # Make output cast nodes to float16 - target_dtype = TensorProto.FLOAT16 - - cast_o_path_name = f"{name}/o_proj/Cast" - cast_o_path_output = f"{name}/output_0" - self.make_node("Cast", inputs=[outputs[0]], outputs=[cast_o_path_output], name=cast_o_path_name, to=target_dtype) - self.make_value_info(cast_o_path_output, target_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) - - cast_present_k_name = f"{name}/present_k/Cast" - cast_present_k_output = f"present.{self.layer_id}.key" - self.make_node("Cast", inputs=[outputs[1]], outputs=[cast_present_k_output], name=cast_present_k_name, to=target_dtype) - self.make_value_info(cast_present_k_output, target_dtype, shape=present_kv_shape) - - cast_present_v_name = f"{name}/present_v/Cast" - cast_present_v_output = f"present.{self.layer_id}.value" - self.make_node("Cast", inputs=[outputs[2]], outputs=[cast_present_v_output], name=cast_present_v_name, to=target_dtype) - self.make_value_info(cast_present_v_output, target_dtype, shape=present_kv_shape) def make_mlp(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph From 28de36d28e083db4ef0c97e3207d2b3cc4937250 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 22 Mar 2024 09:24:34 -0700 Subject: [PATCH 18/36] Update README.md (#223) --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 350a2fe67..cc87d474c 100644 --- a/README.md +++ b/README.md @@ -98,10 +98,6 @@ huggingface-cli login --token python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o ``` -## Known issues - -* Mistral and Gemma support on CUDA only - ## Contributing This project welcomes contributions and suggestions. 
Most contributions require you to agree to a From a8a31a1e578ff5762b608bacdf3792f6044a3f34 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 22 Mar 2024 18:26:17 -0400 Subject: [PATCH 19/36] enable nuget publishing (#209) --- .pipelines/nuget-publishing.yml | 13 +++- .../stages/jobs/nuget-linux-packaging-job.yml | 50 ------------ .../stages/jobs/nuget-packaging-job.yml | 77 +++++++++++++++++++ .../stages/jobs/nuget-win-packaging-job.yml | 70 ----------------- .../stages/jobs/py-linux-packaging-job.yml | 8 +- .../stages/jobs/py-win-packaging-job.yml | 30 +------- .../stages/jobs/steps/capi-linux-step.yml | 23 +++++- .../stages/jobs/steps/capi-win-step.yml | 36 ++++++++- .../jobs/steps/compliant/esrp_nuget.yml | 31 ++++++++ .../jobs/steps/nuget-releasing-step.yml | 49 ++++++++++++ .../stages/jobs/steps/nuget-win-step.yml | 16 +++- .../stages/jobs/steps/utils/capi-archive.yml | 6 +- .../stages/jobs/steps/utils/download-ort.yml | 2 +- .../get-nuget-package-version-as-variable.yml | 42 ++++++++++ .pipelines/stages/nuget-packaging-stage.yml | 28 +++++-- 15 files changed, 313 insertions(+), 168 deletions(-) delete mode 100644 .pipelines/stages/jobs/nuget-linux-packaging-job.yml create mode 100644 .pipelines/stages/jobs/nuget-packaging-job.yml delete mode 100644 .pipelines/stages/jobs/nuget-win-packaging-job.yml create mode 100644 .pipelines/stages/jobs/steps/compliant/esrp_nuget.yml create mode 100644 .pipelines/stages/jobs/steps/nuget-releasing-step.yml create mode 100644 .pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index bb639be7c..e91b57489 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -33,6 +33,16 @@ parameters: - '12.2' default: '11.8' +- name: publish_to_ado_feed + displayName: 'Publish to Azure DevOps Feed' + type: boolean + default: false + +- name: publish_to_nuget + displayName: 'Publish to NuGet.org' + type: boolean + default: false + resources: repositories: - repository: manylinux @@ -51,4 +61,5 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} ort_version: ${{ parameters.ort_version }} cuda_version: ${{ parameters.cuda_version }} - + publish_to_nuget: ${{ parameters.publish_to_nuget }} + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml b/.pipelines/stages/jobs/nuget-linux-packaging-job.yml deleted file mode 100644 index fdf9d7106..000000000 --- a/.pipelines/stages/jobs/nuget-linux-packaging-job.yml +++ /dev/null @@ -1,50 +0,0 @@ -parameters: -- name: arch - type: string -- name: ep - type: string -- name: ort_version - type: string -- name: cuda_version - type: string - default: '' - -jobs: -- job: Linux_Nuget_Packaging_${{ parameters.ep }}_${{ parameters.arch }} - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - timeoutInMinutes: 180 - variables: - - name: artifactName - value: 'onnxruntime-genai-capi-linux-${{ parameters.ep }}-${{ parameters.arch }}' - - name: ort_version - value: ${{ parameters.ort_version }} - - name: arch - value: ${{ parameters.arch }} - - name: ep - value: ${{ parameters.ep }} - - name: buildDir - value: 'build/${{ parameters.ep }}' - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-gpu-${{ 
parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - workspace: - clean: all - steps: - - template: steps/capi-linux-step.yml - parameters: - target: 'onnxruntime-genai' - -# TODO: Add a step to build the nuget package - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime Genai capi' - inputs: - ArtifactName: $(artifactName) - - - template: steps/compliant-and-cleanup-step.yml - diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml new file mode 100644 index 000000000..af5250c3c --- /dev/null +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -0,0 +1,77 @@ +parameters: +- name: arch + type: string +- name: ep + type: string +- name: ort_version + type: string +- name: cuda_version + type: string + default: '' +- name: os + type: string +- name: publish_to_ado_feed + type: boolean +- name: publish_to_nuget + type: boolean +jobs: +- job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging + ${{ if eq(parameters.os, 'linux') }}: + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' + ${{ if eq(parameters.os, 'win') }}: + pool: 'onnxruntime-Win-CPU-2022' + timeoutInMinutes: 180 +# set variables here to be used in the template and steps + variables: + - name: arch + value: ${{ parameters.arch }} + - name: artifactName + value: 'onnxruntime-genai-${{ parameters.os }}-${{ parameters.ep }}-${{ parameters.arch }}' + - name: buildConfig + value: 'Release' + - name: buildDir + value: 'build/${{ parameters.ep }}' + - name: cuda_version + value: ${{ parameters.cuda_version }} + - name: ep + value: ${{ parameters.ep }} + - name: ort_version + value: ${{ parameters.ort_version }} + - name: GDN_CODESIGN_TARGETDIRECTORY + value: '$(Build.ArtifactStagingDirectory)/nuget' + - name: ort_filename + ${{ if eq(parameters.ep, 'cpu') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + ${{ else}}: + ${{if eq(parameters.cuda_version, '11.8') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + - name: genai_nuget_ext + ${{ if eq(parameters.ep, 'cpu') }}: + value: '' + ${{ if eq(parameters.ep, 'cuda') }}: + value: '.Cuda' + - name: ort_nuget_ext + ${{ if eq(parameters.ep, 'cpu') }}: + value: '' + ${{ if eq(parameters.ep, 'cuda') }}: + value: '.Gpu' + workspace: + clean: all + steps: + - template: steps/capi-${{ parameters.os }}-step.yml + parameters: + target: 'onnxruntime-genai' + +# TODO: Add a step to build the linux nuget package + - ${{ if eq(parameters.os, 'win') }}: + - template: steps/nuget-${{ parameters.os }}-step.yml + - ${{ if or(eq(parameters.publish_to_nuget, true), eq(parameters.publish_to_ado_feed, true))}}: + - template: steps/nuget-releasing-step.yml + parameters: + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} + + - template: steps/compliant-and-cleanup-step.yml + diff --git a/.pipelines/stages/jobs/nuget-win-packaging-job.yml b/.pipelines/stages/jobs/nuget-win-packaging-job.yml deleted file mode 100644 index b15ceb1ee..000000000 --- a/.pipelines/stages/jobs/nuget-win-packaging-job.yml +++ /dev/null @@ -1,70 +0,0 @@ -parameters: -- name: arch - type: string - values: - - 'x64' - - 'arm64' -- name: ep - type: string - values: - - 
'cpu' - - 'cuda' -- name: ort_version - type: string -- name: cuda_version - type: string - default: '' -jobs: -- job: Windows_Nuget_Packaging_${{ parameters.ep }}_${{ parameters.arch }} - pool: 'onnxruntime-Win-CPU-2022' - timeoutInMinutes: 180 - variables: - - name: buildConfig - value: 'Release' - - name: cuda_version - value: ${{ parameters.cuda_version }} - - name: ort_version - value: ${{ parameters.ort_version }} - - name: arch - value: ${{ parameters.arch }} - - name: ep - value: ${{ parameters.ep }} - - name: buildDir - value: 'build\${{ parameters.ep }}' - - name: artifactName - value : 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}' - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-win-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - - name: genai_nuget_ext - ${{ if eq(parameters.ep, 'cpu') }}: - value: '' - ${{ if eq(parameters.ep, 'cuda') }}: - value: '.Cuda' - - name: ort_nuget_ext - ${{ if eq(parameters.ep, 'cpu') }}: - value: '' - ${{ if eq(parameters.ep, 'cuda') }}: - value: '.Gpu' - workspace: - clean: all - steps: - - template: steps/capi-win-step.yml - parameters: - target: 'onnxruntime-genai' - - - template: steps/nuget-win-step.yml - - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime Genai capi' - inputs: - ArtifactName: $(artifactName) - - - template: steps/compliant-and-cleanup-step.yml - diff --git a/.pipelines/stages/jobs/py-linux-packaging-job.yml b/.pipelines/stages/jobs/py-linux-packaging-job.yml index 7c9e55044..b7c35d6a5 100644 --- a/.pipelines/stages/jobs/py-linux-packaging-job.yml +++ b/.pipelines/stages/jobs/py-linux-packaging-job.yml @@ -31,6 +31,7 @@ jobs: workspace: clean: all pool: 'onnxruntime-Ubuntu2204-AMD-CPU' +# set variables here to be used in the template and steps variables: # The build machine pool doesn't have dotnet, so it can't run CG. 
- name: skipComponentGovernanceDetection @@ -39,6 +40,8 @@ jobs: value: ${{ parameters.arch }} - name: ep value: ${{ parameters.ep }} + - name: artifactName + value: 'onnxruntime-genai-capi-linux-${{ parameters.ep }}-${{ parameters.arch }}-python' - name: cuda_version value: ${{ parameters.cuda_version }} - name: ort_version @@ -57,10 +60,5 @@ jobs: parameters: target: 'python' - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - ArtifactName: onnxruntime-genai-linux-$(ep)-$(arch) - - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-win-packaging-job.yml b/.pipelines/stages/jobs/py-win-packaging-job.yml index 88b285506..0989398eb 100644 --- a/.pipelines/stages/jobs/py-win-packaging-job.yml +++ b/.pipelines/stages/jobs/py-win-packaging-job.yml @@ -24,11 +24,14 @@ jobs: Python312_x64: PythonVersion: '3.12' timeoutInMinutes: 180 +# set variables here to be used in the template and steps variables: - name: ep value: ${{ parameters.ep }} - name: cuda_version value: ${{ parameters.cuda_version }} + - name: artifactName + value: 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}-wheel' - name: arch value: ${{ parameters.arch }} - name: ort_version @@ -64,32 +67,5 @@ jobs: - template: steps/capi-win-step.yml parameters: target: 'python' -# ep: ${{ parameters.ep }} - - - template: steps/compliant/win-esrp-dll-step.yml - parameters: - FolderPath: '$(Build.Repository.LocalPath)\build\$(ep)\wheel\onnxruntime_genai' - DisplayName: 'ESRP - PYD Sign' - DoEsrp: true - Pattern: '*.pyd' - - - powershell: | - cmake --build --preset windows_$(arch)_$(ep)_release --parallel --PyPackageBuild - displayName: 'Build Python Wheel' - - - powershell: | - Get-ChildItem -Path $(Build.Repository.LocalPath) -Recurse - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.Repository.LocalPath)\build\$(ep)\wheel' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime Genai python wheel' - inputs: - ArtifactName: onnxruntime-genai-win-$(ep)-$(arch) - template: steps/compliant-and-cleanup-step.yml \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 897fae5d5..6fa0f3c92 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -66,6 +66,13 @@ steps: - template: utils/capi-archive.yml parameters: archiveType: tar + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + ArtifactName: $(artifactName)-capi + PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + - ${{ if eq(parameters.target, 'python') }}: - bash: | set -e -x @@ -82,6 +89,13 @@ steps: --target python" displayName: 'Build Python $(PyNoDotVer)' workingDirectory: '$(Build.Repository.LocalPath)' + + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '$(Build.Repository.LocalPath)/**/*.pyd' + continueOnError: true + - bash: | set -e -x docker run \ @@ -97,12 +111,19 @@ steps: --target PyPackageBuild" displayName: 'PyPackageBuild $(PyNoDotVer)' workingDirectory: '$(Build.Repository.LocalPath)' + - task: CopyFiles@2 displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' inputs: SourceFolder: '$(Build.Repository.LocalPath)/build/$(ep)/wheel' 
Contents: '*manylinux*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' + TargetFolder: '$(Build.ArtifactStagingDirectory)/wheel' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: $(artifactName) + PathtoPublish: '$(Build.ArtifactStagingDirectory)/wheel' - script: | ls $(Build.Repository.LocalPath) -R diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index 1e4d9e2d3..aebc4cd13 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -27,7 +27,6 @@ steps: echo "ep=$(ep)" echo "cuda_version=$(cuda_version)" echo "target=${{ parameters.target }}" - echo "ort_filename=$(ort_filename)" displayName: 'Print Parameters' - template: utils/download-ort.yml @@ -62,6 +61,7 @@ steps: parameters: FolderPath: '$(buildDir)' DisplayName: 'ESRP - Sign C++ dlls' + Pattern: '*genai.dll' - task: BinSkim@4 displayName: 'Run BinSkim' @@ -73,9 +73,43 @@ steps: parameters: archiveType: zip + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + ArtifactName: $(artifactName)-capi + PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + - ${{ if eq(parameters.target, 'python') }}: - task: BinSkim@4 displayName: 'Run BinSkim' inputs: AnalyzeTargetGlob: '$(Build.Repository.LocalPath)\**\*.pyd' continueOnError: true + + - template: compliant/win-esrp-dll-step.yml + parameters: + FolderPath: '$(Build.Repository.LocalPath)\build\$(ep)\wheel\onnxruntime_genai' + DisplayName: 'ESRP - PYD Sign' + DoEsrp: true + Pattern: '*.pyd' + + - powershell: | + cmake --build --preset windows_$(arch)_$(ep)_release --parallel --PyPackageBuild + displayName: 'Build Python Wheel' + + - powershell: | + Get-ChildItem -Path $(Build.Repository.LocalPath) -Recurse + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.Repository.LocalPath)\build\$(ep)\wheel' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)\wheel' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: $(artifactName)-wheel + PathtoPublish: '$(Build.ArtifactStagingDirectory)\wheel' + diff --git a/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml new file mode 100644 index 000000000..081e7a809 --- /dev/null +++ b/.pipelines/stages/jobs/steps/compliant/esrp_nuget.yml @@ -0,0 +1,31 @@ +parameters: + FolderPath: '' + DisplayName: '' + DoEsrp: 'false' + +steps: +- ${{ if eq(parameters['DoEsrp'], 'true') }}: + - task: SFP.build-tasks.custom-build-task-1.EsrpCodeSigning@2 + displayName: ${{ parameters.DisplayName }} + inputs: + ConnectedServiceName: 'OnnxRuntime CodeSign 20190817' + FolderPath: ${{ parameters.FolderPath }} + Pattern: '*.nupkg' + signConfigType: inlineSignParams + inlineOperation: | + [ + { + "keyCode": "CP-401405", + "operationSetCode": "NuGetSign", + "parameters": [ ], + "toolName": "sign", + "toolVersion": "1.0" + }, + { + "keyCode": "CP-401405", + "operationSetCode": "NuGetVerify", + "parameters": [ ], + "toolName": "sign", + "toolVersion": "1.0" + } + ] \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/nuget-releasing-step.yml b/.pipelines/stages/jobs/steps/nuget-releasing-step.yml new file mode 100644 index 000000000..8442fd069 --- /dev/null +++ 
b/.pipelines/stages/jobs/steps/nuget-releasing-step.yml @@ -0,0 +1,49 @@ +parameters: +- name: publish_to_ado_feed + type: boolean +- name: publish_to_nuget + type: boolean +steps: +- task: NuGetToolInstaller@1 + inputs: + versionSpec: 6.8.x + +- powershell: | + New-Item -Path $(Agent.TempDirectory) -Name "binfiles" -ItemType "directory" + $base_path_name = Join-Path -Path $(Agent.TempDirectory) -ChildPath "binfiles" + Get-ChildItem $(GDN_CODESIGN_TARGETDIRECTORY) -Filter *.nupkg | + Foreach-Object { + $dir_name = Join-Path -Path $base_path_name -ChildPath $_.Basename + $cmd = "7z.exe x $($_.FullName) -y -o$dir_name" + Write-Output $cmd + Invoke-Expression -Command $cmd + } + dir $(Agent.TempDirectory) + tree $(Agent.TempDirectory) + workingDirectory: '$(Agent.TempDirectory)' + +- task: CodeSign@1 + displayName: 'Run Codesign Validation' + +- task: PublishSecurityAnalysisLogs@3 + displayName: 'Publish Security Analysis Logs' + continueOnError: true + +- task: PostAnalysis@2 + inputs: + GdnBreakAllTools: true + GdnBreakPolicy: M365 + GdnBreakPolicyMinSev: Error + +- template: utils/get-nuget-package-version-as-variable.yml + parameters: + packageFolder: '$(GDN_CODESIGN_TARGETDIRECTORY)' +#This task must be run on a Windows machine +- ${{ if eq(parameters.publish_to_ado_feed, true) }}: + - task: NuGetCommand@2 + displayName: 'NuGet push to Azure DevOps Feed' + inputs: + command: push + packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' + publishVstsFeed: 'PublicPackages/onnxruntime-genai' + allowPackageConflicts: true \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index e54c63336..b3c5e4fa8 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -14,7 +14,7 @@ steps: parameters: FolderPath: '$(Build.Repository.LocalPath)\src\csharp\bin\Release\' DisplayName: 'ESRP - Sign C# dlls' - + Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | $VERSION = '0.1.0-rc1' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` @@ -41,4 +41,16 @@ steps: inputs: SourceFolder: '$(Build.Repository.LocalPath)\nuget' Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file + TargetFolder: '$(Build.ArtifactStagingDirectory)\nuget' + +- template: compliant/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)\nuget' + DoEsrp: 'true' + +- task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + PathtoPublish: '$(Build.ArtifactStagingDirectory)\nuget' + ArtifactName: $(artifactName)-nuget' \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/utils/capi-archive.yml b/.pipelines/stages/jobs/steps/utils/capi-archive.yml index 6f198fe97..1395b31f7 100644 --- a/.pipelines/stages/jobs/steps/utils/capi-archive.yml +++ b/.pipelines/stages/jobs/steps/utils/capi-archive.yml @@ -5,7 +5,7 @@ steps: - bash: | echo "##[error]Error: artifactName and buildDir are not set" exit 1 - displayName: 'Check if variables ort_filename and ort_filename are set' + displayName: 'Check if variables artifactName and buildDir are set' condition: or( eq (variables['artifactName'], ''), eq (variables['buildDir'], '')) - task: CopyFiles@2 @@ -60,9 +60,9 @@ steps: archiveType: ${{ parameters.archiveType }} ${{ if eq(parameters.archiveType, 'tar') }}: tarCompression: 'gz' - archiveFile: 
'$(Build.ArtifactStagingDirectory)/$(artifactName).tgz' + archiveFile: '$(Build.ArtifactStagingDirectory)/capi/$(artifactName).tgz' ${{ else }}: - archiveFile: '$(Build.ArtifactStagingDirectory)/$(artifactName).zip' + archiveFile: '$(Build.ArtifactStagingDirectory)/capi/$(artifactName).zip' replaceExistingArchive: true - task: DeleteFiles@1 diff --git a/.pipelines/stages/jobs/steps/utils/download-ort.yml b/.pipelines/stages/jobs/steps/utils/download-ort.yml index 25bea32c6..366e3009e 100644 --- a/.pipelines/stages/jobs/steps/utils/download-ort.yml +++ b/.pipelines/stages/jobs/steps/utils/download-ort.yml @@ -5,7 +5,7 @@ steps: - bash: | echo "##[error]Error: ort_version and ort_filename are not set" exit 1 - displayName: 'Check if variables ort_filename and ort_filename are set' + displayName: 'Check if variables ort_version and ort_filename are set' condition: or( eq (variables['ort_version'], ''), eq (variables['ort_filename'], '')) - task: DownloadGitHubRelease@0 diff --git a/.pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml b/.pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml new file mode 100644 index 000000000..4edf0d03a --- /dev/null +++ b/.pipelines/stages/jobs/steps/utils/get-nuget-package-version-as-variable.yml @@ -0,0 +1,42 @@ +parameters: + packageFolder: $(Build.ArtifactStagingDirectory) + +steps: +- task: CmdLine@2 + condition: eq(variables['Agent.OS'], 'Windows_NT') + displayName: 'Extract version number from the NuPkg file, Windows VMs' + inputs: + workingDirectory: '${{ parameters.packageFolder }}' + script: | + SETLOCAL EnableDelayedExpansion + FOR /R %%i IN (Microsoft.ML.OnnxRuntime.Managed*.nupkg) do ( + set filename=%%~ni + set ortversion=!filename:~33! + @echo ortversion is !ortversion! + @echo ##vso[task.setvariable variable=NuGetPackageVersionNumber;]!ortversion! + ) +- task: CmdLine@2 + condition: eq(variables['Agent.OS'], 'Windows_NT') + displayName: 'Extract version number from the DirectML NuPkg file, Windows VMs' + inputs: + workingDirectory: '${{ parameters.packageFolder }}' + script: | + SETLOCAL EnableDelayedExpansion + FOR /R %%i IN (Microsoft.ML.OnnxRuntime.DirectML*.nupkg) do ( + set filename=%%~ni + set ortversion=!filename:~34! + @echo DirectMLNuGetPackageVersionNumber is !ortversion! + @echo ##vso[task.setvariable variable=DirectMLNuGetPackageVersionNumber;]!ortversion! + ) +- task: CmdLine@2 + condition: not(eq(variables['Agent.OS'], 'Windows_NT')) + displayName: 'Extract version number from the NuPkg file, Unix VMs' + inputs: + workingDirectory: '${{ parameters.packageFolder }}' + script: | + filenamewithext=$(ls Microsoft.ML.OnnxRuntime.Managed*nupkg) + filename=${filenamewithext%.*} + ortversion=${filename:33} + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
+ set +x + echo "##vso[task.setvariable variable=NuGetPackageVersionNumber;]$ortversion" diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index e1125cf96..db500916b 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -12,35 +12,49 @@ parameters: - name: cuda_version type: string default: '' - +- name: publish_to_ado_feed + type: boolean +- name: publish_to_nuget + type: boolean stages: - stage: nuget_packaging jobs: - ${{ if eq(parameters.enable_win_cpu, true) }}: - - template: jobs/nuget-win-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_win_cuda, true) }}: - - template: jobs/nuget-win-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' ort_version: ${{ parameters.ort_version }} - + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cpu, true) }}: - - template: jobs/nuget-linux-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cuda, true) }}: - - template: jobs/nuget-linux-packaging-job.yml + - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} \ No newline at end of file + ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_nuget: ${{ parameters.publish_to_nuget }} \ No newline at end of file From b3e62ad68eb682fc685d473d4d1e5f5a33f1f308 Mon Sep 17 00:00:00 2001 From: aciddelgado <139922440+aciddelgado@users.noreply.github.com> Date: Fri, 22 Mar 2024 16:35:54 -0700 Subject: [PATCH 20/36] fix benchmark wall clock (#216) Corrects a small error with the benchmarking script implemented in [the original benchmark script PR ](https://github.com/microsoft/onnxruntime-genai/pull/114). All tests done with same parameters. 
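Editorial note, not part of the original pull-request description: the essence of the fix is where the wall-clock timer starts and what the throughput is divided by. Below is a minimal, illustrative reduction of the corrected ordering — it is not the benchmark script itself, it assumes `model` and `tokenizer` are already-constructed `onnxruntime_genai` objects, and the helper name `timed_generation` and its `prompt` parameter are invented for the sketch.

```python
import time
import onnxruntime_genai as og

def timed_generation(model, tokenizer, prompt, max_length):
    # Start the wall clock BEFORE tokenization and generator setup, so the whole
    # end-to-end pipeline falls inside the measured window (this is what the diff
    # below moves to the top of each benchmark repetition).
    wall_clock_start = time.time()

    tokens = tokenizer.encode(prompt)

    params = og.GeneratorParams(model)
    params.input_ids = tokens
    # min_length == max_length forces a fixed number of generated tokens.
    params.set_search_options({"max_length": max_length, "min_length": max_length})
    generator = og.Generator(model, params)

    # Prompt processing + token generation.
    while not generator.is_done():
        generator.compute_logits()
        generator.generate_next_token()

    wall_clock_time = time.time() - wall_clock_start
    # Throughput divides the full sequence length by the full wall-clock window,
    # mirroring the `max_length / avg_wall_clock_time` change in the diff
    # (the real script additionally scales by batch size).
    return max_length / wall_clock_time
```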
## Performance Metrics

## Fp32-cpu

- **Average Tokenization Latency (per token):** 0.4079251408984419 ms
- **Average Tokenization Throughput (per token):** 2451.4301761287193 tps
- **Average Prompt Processing Latency (per token):** 9.474278237737508 ms
- **Average Prompt Processing Throughput (per token):** 105.5489373340173 tps
- **Average Token Generation Latency (per token):** 186.07972450856732 ms
- **Average Token Generation Throughput (per token):** 5.3740406303856005 tps
- **Average Sampling Latency (per token):** 0.1549795838015365 ms
- **Average Sampling Throughput (per token):** 6452.462804910989 tps
- **Average Wall Clock Time:** 49.07776738643646 s
- **Average Wall Clock Throughput:** 7.8243168026858 tps

## Int4-cpu

- **Average Tokenization Latency (per token):** 0.12926810202770866 ms
- **Average Tokenization Throughput (per token):** 7735.860466069577 tps
- **Average Prompt Processing Latency (per token):** 10.303267585383082 ms
- **Average Prompt Processing Throughput (per token):** 97.05658828260155 tps
- **Average Token Generation Latency (per token):** 84.14963581704069 ms
- **Average Token Generation Throughput (per token):** 11.88359272491938 tps
- **Average Sampling Latency (per token):** 0.1820565843723898 ms
- **Average Sampling Throughput (per token):** 5492.797766405076 tps
- **Average Wall Clock Time:** 22.826100442409516 s
- **Average Wall Clock Throughput:** 16.822847203745376 tps

## Fp16-cuda

- **Average Tokenization Latency (per token):** 0.2424314897507429 ms
- **Average Tokenization Throughput (per token):** 4124.876685896518 tps
- **Average Prompt Processing Latency (per token):** 0.07853922086042076 ms
- **Average Prompt Processing Throughput (per token):** 12732.491983555472 tps
- **Average Token Generation Latency (per token):** 8.747893429670668 ms
- **Average Token Generation Throughput (per token):** 114.3132352993979 tps
- **Average Sampling Latency (per token):** 0.02670477955492011 ms
- **Average Sampling Throughput (per token):** 37446.480243112856 tps
- **Average Wall Clock Time:** 2.2886383962631225 s
- **Average Wall Clock Throughput:** 167.785352472891 tps

## Int4-cuda

- **Average Tokenization Latency (per token):** 0.11371983797289431 ms
- **Average Tokenization Throughput (per token):** 8793.540492366468 tps
- **Average Prompt Processing Latency (per token):** 0.10714008702052524 ms
- **Average Prompt Processing Throughput (per token):** 9333.574648006643 tps
- **Average Token Generation Latency (per token):** 4.4145354868173055 ms
- **Average Token Generation Throughput (per token):** 226.52439944954617 tps
- **Average Sampling Latency (per token):** 0.024083436303499184 ms
- **Average Sampling Throughput (per token):** 41522.31381759694 tps
- **Average Wall Clock Time:** 1.1688858103752136 s
- **Average Wall Clock Throughput:** 328.51797548704576 tps

--- benchmark/python/benchmark_e2e.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/benchmark/python/benchmark_e2e.py index f5cfd3143..44e8194eb 100644 --- a/benchmark/python/benchmark_e2e.py +++ b/benchmark/python/benchmark_e2e.py @@ -85,6 +85,8 @@ def main(args): wall_clock_times = [] if args.verbose: print(f"Running benchmark for batch size = {batch_size}, prompt length = {prompt_length}") for _ in tqdm(range(num_repetitions)): + wall_clock_start_time = time.time() + # Prepare run generator = og.Generator(model, params) @@ -94,6 +96,12 @@ def main(args): tokenize_end_time = time.perf_counter()
tokenize_times.append(tokenize_end_time - tokenize_start_time) + # Prepare run + params = og.GeneratorParams(model) + params.input_ids = tokens + params.set_search_options({"max_length":max_length, "min_length":max_length}) + generator = og.Generator(model, params) + # Measure prompt processing prompt_start_time = time.perf_counter() generator.compute_logits() @@ -106,7 +114,6 @@ def main(args): sampling_times.append(sampling_end_time - sampling_start_time) # Measure token generation - wall_clock_start_time = time.time() while not generator.is_done(): # Run inference token_gen_start_time = time.perf_counter() @@ -154,7 +161,7 @@ def main(args): # Calculate wall clock time avg_wall_clock_time = sum(wall_clock_times) / len(wall_clock_times) - avg_wall_clock_thrpt = batch_size * (generation_length / avg_wall_clock_time) + avg_wall_clock_thrpt = batch_size * (max_length / avg_wall_clock_time) print(f"Average Wall Clock Time: {avg_wall_clock_time} s") print(f"Average Wall Clock Throughput: {avg_wall_clock_thrpt} tps") From 879e6194cce1361ed88ce37a5b75c2c8b4146856 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Fri, 22 Mar 2024 20:15:42 -0700 Subject: [PATCH 21/36] Remove GenerateNextToken* special case functions (#221) set_search_options already supports the functionality, so the extra functions are confusing users since there are multiple ways to do the same thing. set_search_options is also more flexible as it supports all future options without the need for extra APIs. --- src/config.h | 2 +- src/csharp/Generator.cs | 20 --------- src/csharp/NativeMethods.cs | 23 ---------- src/generators.cpp | 37 ++++++---------- src/generators.h | 4 -- src/ort_genai_c.cpp | 28 ------------ src/ort_genai_c.h | 17 -------- src/python/python.cpp | 20 --------- test/c_api_tests.cpp | 40 +++-------------- test/csharp/TestOnnxRuntimeGenAIAPI.cs | 41 +++--------------- test/model_tests.cpp | 6 +-- test/sampling_tests.cpp | 60 ++++++++++++++++++-------- 12 files changed, 72 insertions(+), 226 deletions(-) diff --git a/src/config.h b/src/config.h index 6cc634658..2621edc21 100644 --- a/src/config.h +++ b/src/config.h @@ -79,7 +79,7 @@ struct Config { int num_return_sequences{1}; float repetition_penalty{1.0f}; // 1.0 means no penalty. int top_k{}; // Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in the generate method of the model. - float top_p{1.0f}; // If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + float top_p{}; // If set to float >0 and <1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. float temperature{1.0f}; bool early_stopping{true}; // Whether to stop the beam search when at least num_beams sentences are finished per batch or not. 
int no_repeat_ngram_size{}; diff --git a/src/csharp/Generator.cs b/src/csharp/Generator.cs index 10c3d4e47..64c1c5623 100644 --- a/src/csharp/Generator.cs +++ b/src/csharp/Generator.cs @@ -30,26 +30,6 @@ public void GenerateNextToken() Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken(_generatorHandle)); } - public void GenerateNextTokenTop() - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_Top(_generatorHandle)); - } - - public void GenerateNextTokenTopK(int k, float temperature) - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK(_generatorHandle, k, temperature)); - } - - public void GenerateNextTokenTopP(float p, float temperature) - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopP(_generatorHandle, p, temperature)); - } - - public void GenerateNextTokenTopKTopP(int k, float p, float temperature) - { - Result.VerifySuccess(NativeMethods.OgaGenerator_GenerateNextToken_TopK_TopP(_generatorHandle, k, p, temperature)); - } - public ReadOnlySpan GetSequence(ulong index) { ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceLength(_generatorHandle, (UIntPtr)index).ToUInt64(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 039dfb4de..552c9046a 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -80,29 +80,6 @@ internal class NativeLib [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken(IntPtr /* OgaGenerator* */ generator); - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_Top(IntPtr /* OgaGenerator* */ generator); - - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK(IntPtr /* OgaGenerator* */ generator, - int /* int32_t */ k, - float /* single_t */ t); - - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopP(IntPtr /* OgaGenerator* */ generator, - float /* single_t */ p, - float /* single_t */ t); - - // This function is used to generate the next token in the sequence using the greedy search algorithm. - [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerator_GenerateNextToken_TopK_TopP(IntPtr /* OgaGenerator* */ generator, - int /* int32_t */ k, - float /* single_t */ p, - float /* single_t */ t); - // This function returns the length of the sequence at the given index. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceLength(IntPtr /* const OgaGenerator* */ generator, diff --git a/src/generators.cpp b/src/generators.cpp index 6844a9aaf..cee8b1b02 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -84,7 +84,7 @@ Generator::Generator(const Model& model, const GeneratorParams& params) : model_ void Generator::ComputeLogits() { if (computed_logits_) - throw std::runtime_error("ComputeLogits called again without calling GenerateNextToken* first"); + throw std::runtime_error("ComputeLogits called again without calling GenerateNextToken first"); search_->SetLogits(state_->Run(search_->GetSequenceLength(), search_->GetNextTokens(), search_->GetNextIndices())); computed_logits_ = true; @@ -101,46 +101,37 @@ bool Generator::IsDone() const { return search_->IsDone(); } -void Generator::GenerateNextToken_TopK_TopP(int top_k, float top_p, float temperature) { +void Generator::GenerateNextToken() { if (!computed_logits_) - throw std::runtime_error("Must call ComputeLogits before GenerateNextToken*"); + throw std::runtime_error("Must call ComputeLogits before GenerateNextToken"); computed_logits_ = false; - if (top_k == 1) { + auto& search = search_->params_->search; + if (!search.do_sample || search.top_k == 1) { search_->SelectTop(); return; } // The user explicitly called TopK_TopP on a beam search - if (search_->params_->search.num_beams != 1) + if (search.num_beams != 1) throw std::runtime_error("TopK and TopP cannot be used with a beam search"); // Sanity checks - if (top_p < 0.0f || top_p > 1.0f) + if (search.top_p < 0.0f || search.top_p > 1.0f) throw std::runtime_error("top_p must be between 0.0 and 1.0"); - if (top_k < 0) + if (search.top_k < 0) throw std::runtime_error("top_k must be 0 or greater"); - if (top_p > 0.0f && top_p < 1.0f && top_k > 1) { - search_->SampleTopKTopP(top_k, top_p, temperature); - } else if (top_k > 1) { - search_->SampleTopK(top_k, temperature); + if (search.top_p > 0.0f && search.top_p < 1.0f && search.top_k > 1) { + search_->SampleTopKTopP(search.top_k, search.top_p, search.temperature); + } else if (search.top_k > 1) { + search_->SampleTopK(search.top_k, search.temperature); } else { - assert(top_k == 0); - if (top_p == 0.0f) - throw std::runtime_error("top_k and top_p cannot both be zero"); - search_->SampleTopP(top_p, temperature); + assert(search.top_k == 0); + search_->SampleTopP(search.top_p, search.temperature); } } -void Generator::GenerateNextToken() { - auto& search = search_->params_->search; - if (search.do_sample) - GenerateNextToken_TopK_TopP(search.top_k, search.top_p, search.temperature); - else - GenerateNextToken_Top(); -} - RoamingArray Generator::GetSequence(int index) const { return search_->GetSequence(index); } diff --git a/src/generators.h b/src/generators.h index 1b42b45e9..3fb9f5201 100644 --- a/src/generators.h +++ b/src/generators.h @@ -100,10 +100,6 @@ struct Generator { bool IsDone() const; void ComputeLogits(); - void GenerateNextToken_TopK_TopP(int top_k, float top_p, float temperature); - void GenerateNextToken_TopP(float p, float temperature) { GenerateNextToken_TopK_TopP(0, p, temperature); } - void GenerateNextToken_TopK(int k, float temperature) { GenerateNextToken_TopK_TopP(k, 0.0f, temperature); } - void GenerateNextToken_Top() { GenerateNextToken_TopK_TopP(1, 0.0f, 0.0f); } void GenerateNextToken(); RoamingArray GetSequence(int index) const; diff --git a/src/ort_genai_c.cpp 
b/src/ort_genai_c.cpp index bbf84be51..1beb2a43b 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -156,34 +156,6 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_Top(); - return nullptr; - OGA_CATCH -} - -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_TopK(k, t); - return nullptr; - OGA_CATCH -} - -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_TopP(p, t); - return nullptr; - OGA_CATCH -} - -OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t) { - OGA_TRY - reinterpret_cast(generator)->GenerateNextToken_TopK_TopP(k, p, t); - return nullptr; - OGA_CATCH -} - size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().size(); diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index e702082fc..fbd394f10 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -172,23 +172,6 @@ OGA_EXPORT bool OGA_API_CALL OgaGenerator_IsDone(const OgaGenerator* generator); * \return OgaResult containing the error message if the computation of the logits failed. */ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_ComputeLogits(OgaGenerator* generator); - -/* - * \brief Generates the next token based on the computed logits using the greedy search. - * \param[in] generator The generator to generate the next token for. - * \return OgaResult containing the error message if the generation of the next token failed. 
- */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_Top(OgaGenerator* generator); - -/* Top-K sampling: most probable words from the model's output probability distribution for the next word - */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK(OgaGenerator* generator, int k, float t); - -/*Top-P sampling selects words from the smallest set of words whose cumulative probability exceeds a predefined threshold (p) - */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopP(OgaGenerator* generator, float p, float t); - -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken_TopK_TopP(OgaGenerator* generator, int k, float p, float t); OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator); /* diff --git a/src/python/python.cpp b/src/python/python.cpp index 1c8db803d..584beb97c 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -137,22 +137,6 @@ struct PyGenerator { generator_->ComputeLogits(); } - void GenerateNextToken_TopK_TopP(int top_k, float top_p, float temperature) { - generator_->GenerateNextToken_TopK_TopP(top_k, top_p, temperature); - } - - void GenerateNextToken_TopP(float p, float temperature) { - generator_->GenerateNextToken_TopP(p, temperature); - } - - void GenerateNextToken_TopK(int k, float temperature) { - generator_->GenerateNextToken_TopK(k, temperature); - } - - void GenerateNextToken_Top() { - generator_->GenerateNextToken_Top(); - } - void GenerateNextToken() { generator_->GenerateNextToken(); } @@ -235,10 +219,6 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def("is_done", &PyGenerator::IsDone) .def("compute_logits", &PyGenerator::ComputeLogits) .def("generate_next_token", &PyGenerator::GenerateNextToken) - .def("generate_next_token_top", &PyGenerator::GenerateNextToken_Top) - .def("generate_next_token_top_p", &PyGenerator::GenerateNextToken_TopP) - .def("generate_next_token_top_k", &PyGenerator::GenerateNextToken_TopK) - .def("generate_next_token_top_k_top_p", &PyGenerator::GenerateNextToken_TopK_TopP) .def("get_next_tokens", &PyGenerator::GetNextTokens) .def("get_sequence", &PyGenerator::GetSequence); diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 2ac6bfb71..3a04a8180 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -187,6 +187,7 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", max_length)); + CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", false)); CheckResult(OgaGeneratorParamsSetInputIDs(params, input_ids.data(), input_ids.size(), sequence_length, batch_size)); OgaGenerator* generator; @@ -195,7 +196,7 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { while (!OgaGenerator_IsDone(generator)) { CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_Top(generator)); + CheckResult(OgaGenerator_GenerateNextToken(generator)); } // Verify outputs match expected outputs @@ -252,20 +253,11 @@ TEST(CAPITests, TopKCAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; - - while 
(!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_TopK(generator, top_k, temp)); - } - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; @@ -310,20 +302,10 @@ TEST(CAPITests, TopPCAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; - - while (!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_TopP(generator, top_p, temp)); - } - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; @@ -369,21 +351,11 @@ TEST(CAPITests, TopKTopPCAPI) { CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; - - while (!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken_TopK_TopP(generator, top_k, top_p, temp)); - } - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); + CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 7bca5ffdc..156f943b4 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -64,7 +64,7 @@ public void TestGreedySearch() while (!generator.IsDone()) { generator.ComputeLogits(); - generator.GenerateNextTokenTop(); + generator.GenerateNextToken(); } for (ulong i = 0; i < batchSize; i++) @@ -92,7 +92,7 @@ public void TestTopKSearch() { int topK = 100; float temp = 0.6f; - ulong maxLength = 40; + ulong maxLength = 20; string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) @@ -115,17 +115,8 @@ public 
void TestTopKSearch() using GeneratorParams generatorParams = new GeneratorParams(model); Assert.NotNull(generatorParams); - generatorParams.SetSearchOption("max_length", 20); generatorParams.SetInputSequences(sequences); - - using Generator generator = new Generator(model, generatorParams); - Assert.NotNull(generator); - while (!generator.IsDone()) - { - generator.ComputeLogits(); - generator.GenerateNextTokenTopK(topK, temp); - } - + generatorParams.SetSearchOption("max_length", maxLength); generatorParams.SetSearchOption("do_sample", true); generatorParams.SetSearchOption("top_k", topK); generatorParams.SetSearchOption("temperature", temp); @@ -143,7 +134,7 @@ public void TestTopPSearch() { float topP = 0.6f; float temp = 0.6f; - ulong maxLength = 40; + ulong maxLength = 20; string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) @@ -166,17 +157,8 @@ public void TestTopPSearch() using GeneratorParams generatorParams = new GeneratorParams(model); Assert.NotNull(generatorParams); - generatorParams.SetSearchOption("max_length", 20); generatorParams.SetInputSequences(sequences); - - using Generator generator = new Generator(model, generatorParams); - Assert.NotNull(generator); - while (!generator.IsDone()) - { - generator.ComputeLogits(); - generator.GenerateNextTokenTopP(topP, temp); - } - + generatorParams.SetSearchOption("max_length", maxLength); generatorParams.SetSearchOption("do_sample", true); generatorParams.SetSearchOption("top_p", topP); generatorParams.SetSearchOption("temperature", temp); @@ -195,7 +177,7 @@ public void TestTopKTopPSearch() int topK = 100; float topP = 0.6f; float temp = 0.6f; - ulong maxLength = 40; + ulong maxLength = 20; string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) @@ -218,17 +200,8 @@ public void TestTopKTopPSearch() using GeneratorParams generatorParams = new GeneratorParams(model); Assert.NotNull(generatorParams); - generatorParams.SetSearchOption("max_length", 20); generatorParams.SetInputSequences(sequences); - - using Generator generator = new Generator(model, generatorParams); - Assert.NotNull(generator); - while (!generator.IsDone()) - { - generator.ComputeLogits(); - generator.GenerateNextTokenTopKTopP(topK, topP, temp); - } - + generatorParams.SetSearchOption("max_length", maxLength); generatorParams.SetSearchOption("do_sample", true); generatorParams.SetSearchOption("top_k", topK); generatorParams.SetSearchOption("top_p", topP); diff --git a/test/model_tests.cpp b/test/model_tests.cpp index a2b3a7832..79c1d2c64 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -44,7 +44,7 @@ TEST(ModelTests, GreedySearchGptFp32) { while (!generator->IsDone()) { generator->ComputeLogits(); - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); } // Verify outputs match expected outputs @@ -128,7 +128,7 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, const char* model_label) while (!generator->IsDone()) { generator->ComputeLogits(); - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); } // Verify outputs match expected outputs @@ -226,7 +226,7 @@ Print all primes between 1 and n auto generator = Generators::CreateGenerator(*model, *params); while (!generator->IsDone()) { generator->ComputeLogits(); - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); } auto result = generator->GetSequence(0); diff --git 
a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 531270f78..239c71ab6 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -27,6 +27,8 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample=true; + params->search.top_p=0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -37,7 +39,7 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { generator->search_->SetLogits(logits_span); generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->GenerateNextToken_TopP(0.25f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -53,6 +55,8 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -64,8 +68,7 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { generator->computed_logits_ = true; // Verify outputs match expected outputs - int k = 2; - generator->GenerateNextToken_TopK(k, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -85,6 +88,9 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; + params->search.top_p = 0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -95,9 +101,7 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; // Verify outputs match expected outputs - float p = 0.25f; - int k = 2; - generator->GenerateNextToken_TopK_TopP(k, p, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -130,6 +134,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_p = 0.95f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -147,7 +153,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopP(0.95f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -166,6 +172,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; params->batch_size = batch_size; params->sequence_length = 1; 
params->vocab_size = vocab_size; @@ -183,7 +191,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { auto logits_copy=logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK(k, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -203,6 +211,9 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; + params->search.top_p = p; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -220,7 +231,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { auto logits_copy = logits_cpu; generator->search_->SetLogits(Generators::cpu_span(logits_copy)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); // Verify outputs match expected outputs for (int b = 0; b < batch_size; b++) { @@ -248,6 +259,8 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_p = 0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -259,7 +272,7 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs - generator->GenerateNextToken_TopP(0.25f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); EXPECT_TRUE(0 == std::memcmp(output_span.data(), next_tokens.data(), expected_output.size() * sizeof(int32_t))); } @@ -276,6 +289,8 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -287,8 +302,7 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs - int k = 2; - generator->GenerateNextToken_TopK(k, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -309,6 +323,9 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { int batch_size = 4; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = 2; + params->search.top_p = 0.25f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -320,9 +337,7 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), logits_cpu.size())); generator->computed_logits_ = true; // Verify outputs match expected outputs - float p = 0.25f; - int k = 2; - 
generator->GenerateNextToken_TopK_TopP(k, p, 1.0); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); for (int b = 0; b < batch_size; b++) { auto next_token = next_tokens[b]; @@ -338,6 +353,8 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_p = 0.95f; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -358,7 +375,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopP(0.95f, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs @@ -378,6 +395,8 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -398,7 +417,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCuda) { auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK(k, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs @@ -419,6 +438,9 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { std::vector input_ids{0, 1, 2, 3, 4}; auto params = Generators::CreateGeneratorParams(); params->search.max_length = 10; + params->search.do_sample = true; + params->search.top_k = k; + params->search.top_p = p; params->batch_size = batch_size; params->sequence_length = 1; params->vocab_size = vocab_size; @@ -439,7 +461,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { cudaMemcpyAsync(cpu_logits, logits_gpu.get(), vocab_size * batch_size * sizeof(float), cudaMemcpyDeviceToHost, params->cuda_stream); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_TopK_TopP(k, p, 1.0f); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs @@ -478,7 +500,7 @@ TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { auto generator = Generators::CreateGenerator(*model, *params); generator->search_->SetLogits(Generators::gpu_span(logits_gpu.get(), vocab_size * batch_size)); generator->computed_logits_ = true; - generator->GenerateNextToken_Top(); + generator->GenerateNextToken(); auto next_tokens = generator->search_->GetNextTokens().GetCPU(); cudaStreamSynchronize(params->cuda_stream); // Verify outputs match expected outputs From a76a9ac46f91a1aeb7ecad8958dde3f0ac0f3d81 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 25 Mar 2024 
16:26:53 -0400 Subject: [PATCH 22/36] Fix a typo (#232) --- .pipelines/stages/jobs/steps/nuget-win-step.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index b3c5e4fa8..3191412a3 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -53,4 +53,4 @@ steps: displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: PathtoPublish: '$(Build.ArtifactStagingDirectory)\nuget' - ArtifactName: $(artifactName)-nuget' \ No newline at end of file + ArtifactName: $(artifactName)-nuget \ No newline at end of file From 4022ba6b48827768ab7b06a8d5e84c6921f493d1 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:47:48 -0700 Subject: [PATCH 23/36] Easy to use C++ API Wrapper (#225) Wraps the C API in a 0-overhead C++ style API that automatically manages resources and gives C++ style interfaces. It removes all of the C "glue" and should be much simpler and safer to use. I switched the C API example to use it, plus our C API unit tests. Before: ``` OgaModel* model; CheckResult(OgaCreateModel("phi-2", OgaDeviceTypeCPU, &model)); OgaModelPtr model_ptr{model}; OgaTokenizer* tokenizer; CheckResult(OgaCreateTokenizer(model, &tokenizer)); OgaTokenizerPtr tokenizer_ptr{tokenizer}; const char* prompt = "def is_prime(num):"; std::cout << "Prompt: " << std::endl << prompt << std::endl; OgaSequences* sequences; CheckResult(OgaCreateSequences(&sequences)); OgaSequencesPtr sequences_ptr{sequences}; CheckResult(OgaTokenizerEncode(tokenizer, prompt, sequences)); OgaGeneratorParams* params; CheckResult(OgaCreateGeneratorParams(model, ¶ms)); OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 200)); CheckResult(OgaGeneratorParamsSetInputSequences(params, sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); OgaSequencesPtr output_sequences_ptr{output_sequences}; size_t sequence_length = OgaSequencesGetSequenceCount(output_sequences, 0); const int32_t* sequence = OgaSequencesGetSequenceData(output_sequences, 0); const char* out_string; CheckResult(OgaTokenizerDecode(tokenizer, sequence, sequence_length, &out_string)); std::cout << "Output: " << std::endl << out_string << std::endl; ``` After: ``` auto model = OgaModel::Create("phi-2"); auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; std::cout << "Prompt: " << std::endl << prompt << std::endl; auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt, *sequences); auto params = OgaGeneratorParams::Create(*model); params->SetSearchOption("max_length", 200); params->SetInputSequences(*sequences); auto output_sequences = model->Generate(*params); auto out_string = tokenizer->Decode(output_sequences->Get(0)); std::cout << "Output: " << std::endl << out_string << std::endl; ``` --- examples/c/README.md | 3 +- examples/c/src/main.cpp | 97 ++++++----- src/ort_genai.h | 189 ++++++++++++++++++++++ src/ort_genai_c.cpp | 29 ++-- src/ort_genai_c.h | 2 +- test/c_api_tests.cpp | 348 ++++++++++++---------------------------- 6 files changed, 361 insertions(+), 307 deletions(-) create mode 100644 src/ort_genai.h diff --git a/examples/c/README.md b/examples/c/README.md index 6bd58b38d..8cd2168fd 100644 --- a/examples/c/README.md +++ b/examples/c/README.md @@ -33,10 +33,9 @@ python -m 
onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o - onnxruntime.dll - onnxruntime_providers_shared.dll - onnxruntime_providers_cuda.dll - - onnxruntime.lib - onnxruntime-genai.dll - onnxruntime-genai.lib -2. Copy over the `ort_genai_c.h` header file to the [include](include) directory. +2. Copy over the `ort_genai.h` and `ort_genai_c.h` header files to the [include](include) directory. On Windows: ```bash diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index 60ee8b837..d9aeb68a8 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -1,72 +1,61 @@ #include -#include "ort_genai_c.h" +#include +#include "ort_genai.h" -struct Deleters { - void operator()(OgaResult* p) { - OgaDestroyResult(p); - } - void operator()(OgaSequences* p) { - OgaDestroySequences(p); - } - void operator()(OgaModel* p) { - OgaDestroyModel(p); - } - void operator()(OgaGeneratorParams* p) { - OgaDestroyGeneratorParams(p); - } - void operator()(OgaGenerator* p) { - OgaDestroyGenerator(p); - } - void operator()(OgaTokenizer* p) { - OgaDestroyTokenizer(p); - } -}; +// C++ API Example -using OgaResultPtr = std::unique_ptr; -using OgaSequencesPtr = std::unique_ptr; -using OgaModelPtr = std::unique_ptr; -using OgaGeneratorParamsPtr = std::unique_ptr; -using OgaGeneratorPtr = std::unique_ptr; -using OgaTokenizerPtr = std::unique_ptr; +void CXX_API() { + auto model = OgaModel::Create("phi-2"); + auto tokenizer = OgaTokenizer::Create(*model); -void CheckResult(OgaResult* result) { - if (!result) - return; + const char* prompt = "def is_prime(num):"; + std::cout << "Prompt: " << std::endl << prompt << std::endl; + + auto sequences = OgaSequences::Create(); + tokenizer->Encode(prompt, *sequences); + + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 200); + params->SetInputSequences(*sequences); + + auto output_sequences = model->Generate(*params); + auto out_string = tokenizer->Decode(output_sequences->Get(0)); - OgaResultPtr result_ptr{result}; - throw std::runtime_error(OgaResultGetError(result)); + std::cout << "Output: " << std::endl << out_string << std::endl; } -int main() { - std::cout << "-------------" << std::endl; - std::cout << "Hello, Phi-2!" 
<< std::endl; - std::cout << "-------------" << std::endl; +// C API Example + +void CheckResult(OgaResult* result) { + if (result) { + std::string string=OgaResultGetError(result); + OgaDestroyResult(result); + throw std::runtime_error(string); + } +} +void C_API() { OgaModel* model; - CheckResult(OgaCreateModel("phi-2", OgaDeviceTypeCPU, &model)); - OgaModelPtr model_ptr{model}; + OgaCreateModel("phi-2", &model); OgaTokenizer* tokenizer; CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; const char* prompt = "def is_prime(num):"; - std::cout << "Prompt: " << std::endl << prompt << std::endl; + std::cout << "Prompt: " << std::endl + << prompt << std::endl; OgaSequences* sequences; CheckResult(OgaCreateSequences(&sequences)); - OgaSequencesPtr sequences_ptr{sequences}; CheckResult(OgaTokenizerEncode(tokenizer, prompt, sequences)); OgaGeneratorParams* params; CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 200)); CheckResult(OgaGeneratorParamsSetInputSequences(params, sequences)); OgaSequences* output_sequences; CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; size_t sequence_length = OgaSequencesGetSequenceCount(output_sequences, 0); const int32_t* sequence = OgaSequencesGetSequenceData(output_sequences, 0); @@ -74,7 +63,27 @@ int main() { const char* out_string; CheckResult(OgaTokenizerDecode(tokenizer, sequence, sequence_length, &out_string)); - std::cout << "Output: " << std::endl << out_string << std::endl; + std::cout << "Output: " << std::endl + << out_string << std::endl; + + OgaDestroyString(out_string); + OgaDestroySequences(output_sequences); + OgaDestroyGeneratorParams(params); + OgaDestroySequences(sequences); + OgaDestroyTokenizer(tokenizer); + OgaDestroyModel(model); +} + +int main() { + std::cout << "-------------" << std::endl; + std::cout << "Hello, Phi-2!" << std::endl; + std::cout << "-------------" << std::endl; + + std::cout << "C++ API" << std::endl; + CXX_API(); + + std::cout << "C API" << std::endl; + C_API(); return 0; } \ No newline at end of file diff --git a/src/ort_genai.h b/src/ort_genai.h new file mode 100644 index 000000000..82f8c722c --- /dev/null +++ b/src/ort_genai.h @@ -0,0 +1,189 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include "ort_genai_c.h" + +// GenAI C++ API +// +// This is a zero cost wrapper around the C API, and provides for a set of C++ classes with automatic resource management + +/* A simple end to end example of how to generate an answer from a prompt: + * + * auto model = OgaModel::Create("phi-2"); + * auto tokenizer = OgaTokenizer::Create(*model); + * + * auto sequences = OgaSequences::Create(); + * tokenizer->Encode("A great recipe for Kung Pao chicken is ", *sequences); + * + * auto params = OgaGeneratorParams::Create(*model); + * params->SetInputSequences(*sequences); + * params->SetSearchOption("max_length", 200); + * + * auto output_sequences = model->Generate(*params); + * auto out_string = tokenizer->Decode(output_sequences->Get(0)); + * + * std::cout << "Output: " << std::endl << out_string << std::endl; + */ + +// The types defined in this file are to give us zero overhead C++ style interfaces around an opaque C pointer. 
+// For example, there is no actual 'OgaModel' type defined anywhere, so we create a fake definition here +// that lets users have a C++ style OgaModel type that can be held in a std::unique_ptr. +// +// This OgaAbstract struct is to prevent accidentally trying to use them by value. +struct OgaAbstract { + OgaAbstract() = delete; + OgaAbstract(const OgaAbstract&) = delete; + void operator=(const OgaAbstract&) = delete; +}; + +struct OgaResult : OgaAbstract { + const char* GetError() const { return OgaResultGetError(this); } + static void operator delete(void* p) { OgaDestroyResult(reinterpret_cast(p)); } +}; + +// This is used to turn OgaResult return values from the C API into std::runtime_error exceptions +inline void OgaCheckResult(OgaResult* result) { + if (result) { + std::unique_ptr p_result{result}; // Take ownership so it's destroyed properly + throw std::runtime_error(p_result->GetError()); + } +} + +struct OgaModel : OgaAbstract { + static std::unique_ptr Create(const char* config_path) { + OgaModel* p; + OgaCheckResult(OgaCreateModel(config_path, &p)); + return std::unique_ptr(p); + } + + std::unique_ptr Generate(const OgaGeneratorParams& params) { + OgaSequences* p; + OgaCheckResult(OgaGenerate(this, ¶ms, &p)); + return std::unique_ptr(p); + } + + static void operator delete(void* p) { OgaDestroyModel(reinterpret_cast(p)); } +}; + +struct OgaString { + OgaString(const char* p) : p_{p} {} + ~OgaString() { OgaDestroyString(p_); } + + operator const char*() const { return p_; } + + const char* p_; +}; + +struct OgaSequences : OgaAbstract { + static std::unique_ptr Create() { + OgaSequences* p; + OgaCheckResult(OgaCreateSequences(&p)); + return std::unique_ptr(p); + } + + size_t Count() const { + return OgaSequencesCount(this); + } + + std::span Get(size_t index) const { + return {OgaSequencesGetSequenceData(this, index), OgaSequencesGetSequenceCount(this, index)}; + } + + static void operator delete(void* p) { OgaDestroySequences(reinterpret_cast(p)); } +}; + +struct OgaTokenizer : OgaAbstract { + static std::unique_ptr Create(const OgaModel& model) { + OgaTokenizer* p; + OgaCheckResult(OgaCreateTokenizer(&model, &p)); + return std::unique_ptr(p); + } + + void Encode(const char* str, OgaSequences& sequences) const { + OgaCheckResult(OgaTokenizerEncode(this, str, &sequences)); + } + + OgaString Decode(std::span tokens) const { + const char* p; + OgaCheckResult(OgaTokenizerDecode(this, tokens.data(), tokens.size(), &p)); + return p; + } + + static void operator delete(void* p) { OgaDestroyTokenizer(reinterpret_cast(p)); } +}; + +struct OgaTokenizerStream : OgaAbstract { + static std::unique_ptr Create(const OgaTokenizer& tokenizer) { + OgaTokenizerStream* p; + OgaCheckResult(OgaCreateTokenizerStream(&tokenizer, &p)); + return std::unique_ptr(p); + } + + /* + * Decode a single token in the stream. If this results in a word being generated, it will be returned in 'out'. + * The caller is responsible for concatenating each chunk together to generate the complete result. 
+ * 'out' is valid until the next call to OgaTokenizerStreamDecode or when the OgaTokenizerStream is destroyed + */ + const char* Decode(int32_t token) { + const char* out; + OgaCheckResult(OgaTokenizerStreamDecode(this, token, &out)); + return out; + } + + static void operator delete(void* p) { OgaDestroyTokenizerStream(reinterpret_cast(p)); } +}; + +struct OgaGeneratorParams : OgaAbstract { + static std::unique_ptr Create(const OgaModel& model) { + OgaGeneratorParams* p; + OgaCheckResult(OgaCreateGeneratorParams(&model, &p)); + return std::unique_ptr(p); + } + + void SetSearchOption(const char* name, int value) { + OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); + } + + void SetSearchOption(const char* name, double value) { + OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); + } + + void SetSearchOption(const char* name, bool value) { + OgaCheckResult(OgaGeneratorParamsSetSearchBool(this, name, value)); + } + + void SetInputIDs(const int32_t* input_ids, size_t input_ids_count, size_t sequence_length, size_t batch_size) { + OgaCheckResult(OgaGeneratorParamsSetInputIDs(this, input_ids, input_ids_count, sequence_length, batch_size)); + } + + void SetInputSequences(const OgaSequences& sequences) { + OgaCheckResult(OgaGeneratorParamsSetInputSequences(this, &sequences)); + } + + static void operator delete(void* p) { OgaDestroyGeneratorParams(reinterpret_cast(p)); } +}; + +struct OgaGenerator : OgaAbstract { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { + OgaGenerator* p; + OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); + return std::unique_ptr(p); + } + + bool IsDone() const { + return OgaGenerator_IsDone(this); + } + + void ComputeLogits() { + OgaCheckResult(OgaGenerator_ComputeLogits(this)); + } + + void GenerateNextToken() { + OgaCheckResult(OgaGenerator_GenerateNextToken(this)); + } + + std::span GetSequence(size_t index) const { + return {OgaGenerator_GetSequence(this, index), OgaGenerator_GetSequenceLength(this, index)}; + } + + static void operator delete(void* p) { OgaDestroyGenerator(reinterpret_cast(p)); } +}; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 1beb2a43b..e9548d509 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -1,11 +1,12 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include "ort_genai_c.h" #include #include -#include +#include #include #include +#include "span.h" +#include "ort_genai_c.h" #include "generators.h" #include "models/model.h" #include "search.h" @@ -22,24 +23,24 @@ OrtEnv& GetOrtEnv() { return *g_ort_env; } +struct Result { + explicit Result(const char* what) : what_{what} {} + std::string what_; +}; + } // namespace Generators extern "C" { #define OGA_TRY try { -#define OGA_CATCH \ - } \ - catch (const std::exception& e) { \ - return new OgaResult{e.what()}; \ +#define OGA_CATCH \ + } \ + catch (const std::exception& e) { \ + return reinterpret_cast(std::make_unique(e.what()).release()); \ } -struct OgaResult { - explicit OgaResult(const char* what) : what_{what} {} - std::string what_; -}; - -const char* OGA_API_CALL OgaResultGetError(OgaResult* result) { - return result->what_.c_str(); +const char* OGA_API_CALL OgaResultGetError(const OgaResult* result) { + return reinterpret_cast(result)->what_.c_str(); } OgaResult* OGA_API_CALL OgaCreateSequences(OgaSequences** out) { @@ -231,7 +232,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGetCurrentGpuDeviceId(int* device_id) { } void OGA_API_CALL OgaDestroyResult(OgaResult* p) { - delete p; + delete reinterpret_cast(p); } void OGA_API_CALL OgaDestroyString(const char* p) { diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index fbd394f10..41eb65909 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -42,7 +42,7 @@ typedef struct OgaTokenizerStream OgaTokenizerStream; * \return Error message contained in the OgaResult. The const char* is owned by the OgaResult * and can will be freed when the OgaResult is destroyed. */ -OGA_EXPORT const char* OGA_API_CALL OgaResultGetError(OgaResult* result); +OGA_EXPORT const char* OGA_API_CALL OgaResultGetError(const OgaResult* result); /* * \param[in] result OgaResult to be destroyed. 
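Editorial note, not part of the patch: the commit message above demonstrates the batch `Generate()` path; the same `ort_genai.h` header also supports an incremental loop, which pairs naturally with `OgaTokenizerStream` for printing text as it is produced. The sketch below is assembled only from the wrapper classes added in this patch, assuming the same `phi-2` model folder used in the examples and a C++20 compiler (for `std::span`); it is not code taken from the repository.

```cpp
#include <iostream>
#include "ort_genai.h"

int main() {
  auto model = OgaModel::Create("phi-2");
  auto tokenizer = OgaTokenizer::Create(*model);
  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);

  auto sequences = OgaSequences::Create();
  tokenizer->Encode("def is_prime(num):", *sequences);

  auto params = OgaGeneratorParams::Create(*model);
  params->SetSearchOption("max_length", 200);
  params->SetInputSequences(*sequences);

  // Step the generator manually instead of calling model->Generate(),
  // decoding the newest token of the sequence on every iteration.
  auto generator = OgaGenerator::Create(*model, *params);
  while (!generator->IsDone()) {
    generator->ComputeLogits();
    generator->GenerateNextToken();
    std::cout << tokenizer_stream->Decode(generator->GetSequence(0).back()) << std::flush;
  }
  std::cout << std::endl;
  return 0;
}
```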
diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index 3a04a8180..a1ec2b923 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -3,71 +3,22 @@ #include #include #include -#include +#include #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" #endif -struct Deleters { - void operator()(OgaResult* p) { - OgaDestroyResult(p); - } - void operator()(OgaSequences* p) { - OgaDestroySequences(p); - } - void operator()(OgaModel* p) { - OgaDestroyModel(p); - } - void operator()(OgaTokenizer* p) { - OgaDestroyTokenizer(p); - } - void operator()(OgaTokenizerStream* p) { - OgaDestroyTokenizerStream(p); - } - void operator()(OgaGeneratorParams* p) { - OgaDestroyGeneratorParams(p); - } - void operator()(OgaGenerator* p) { - OgaDestroyGenerator(p); - } -}; - -using OgaResultPtr = std::unique_ptr; -using OgaSequencesPtr = std::unique_ptr; -using OgaModelPtr = std::unique_ptr; -using OgaTokenizerPtr = std::unique_ptr; -using OgaTokenizerStreamPtr = std::unique_ptr; -using OgaGeneratorParamsPtr = std::unique_ptr; -using OgaGeneratorPtr = std::unique_ptr; - -void CheckResult(OgaResult* result) { - if (!result) - return; - - OgaResultPtr result_ptr{result}; - throw std::runtime_error(OgaResultGetError(result)); -} - TEST(CAPITests, TokenizerCAPI) { #if TEST_PHI2 - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; - - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; + auto model = OgaModel::Create(MODEL_PATH "phi-2"); + auto tokenizer = OgaTokenizer::Create(*model); // Encode single decode single { const char* input_string = "She sells sea shells by the sea shore."; - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - CheckResult(OgaTokenizerEncode(tokenizer, input_string, input_sequences)); - OgaSequencesPtr input_sequences_ptr{input_sequences}; - - std::span sequence{OgaSequencesGetSequenceData(input_sequences, 0), OgaSequencesGetSequenceCount(input_sequences, 0)}; - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + auto input_sequences = OgaSequences::Create(); + tokenizer->Encode(input_string, *input_sequences); + + auto out_string = tokenizer->Decode(input_sequences->Get(0)); ASSERT_STREQ(input_string, out_string); } @@ -77,39 +28,30 @@ TEST(CAPITests, TokenizerCAPI) { "The quick brown fox jumps over the lazy dog.", }; - OgaSequences* sequences; - CheckResult(OgaCreateSequences(&sequences)); - OgaSequencesPtr sequences_ptr{sequences}; + auto sequences = OgaSequences::Create(); // Encode all strings { - for (auto &string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, sequences)); + for (auto& string : input_strings) + tokenizer->Encode(string, *sequences); } // Decode one at a time - for (size_t i = 0; i < OgaSequencesCount(sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(sequences, i), OgaSequencesGetSequenceCount(sequences, i)}; - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + for (size_t i = 0; i < sequences->Count(); i++) { + auto out_string = tokenizer->Decode(sequences->Get(i)); std::cout << "Decoded string:" << out_string << std::endl; if (strcmp(input_strings[i], out_string) != 0) throw std::runtime_error("Token decoding mismatch"); - OgaDestroyString(out_string); } // Stream Decode one at a time - for (size_t i = 0; i < 
OgaSequencesCount(sequences); i++) { - OgaTokenizerStream* tokenizer_stream; - CheckResult(OgaCreateTokenizerStream(tokenizer, &tokenizer_stream)); - OgaTokenizerStreamPtr tokenizer_stream_ptr{tokenizer_stream}; + for (size_t i = 0; i < sequences->Count(); i++) { + auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); - std::span sequence{OgaSequencesGetSequenceData(sequences, i), OgaSequencesGetSequenceCount(sequences, i)}; + std::span sequence = sequences->Get(i); std::string stream_result; for (auto& token : sequence) { - const char* chunk; - CheckResult(OgaTokenizerStreamDecode(tokenizer_stream, token, &chunk)); - stream_result += std::string(chunk); + stream_result += tokenizer_stream->Decode(token); } std::cout << "Stream decoded string:" << stream_result << std::endl; if (strcmp(input_strings[i], stream_result.c_str()) != 0) @@ -120,17 +62,8 @@ TEST(CAPITests, TokenizerCAPI) { TEST(CAPITests, EndToEndPhiBatch) { #if TEST_PHI2 - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; - - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; - - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; + auto model = OgaModel::Create(MODEL_PATH "phi-2"); + auto tokenizer = OgaTokenizer::Create(*model); const char* input_strings[] = { "This is a test.", @@ -138,27 +71,20 @@ TEST(CAPITests, EndToEndPhiBatch) { "The quick brown fox jumps over the lazy dog.", }; + auto input_sequences = OgaSequences::Create(); for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + tokenizer->Encode(string, *input_sequences); - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 20)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 20); + params->SetInputSequences(*input_sequences); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; + auto output_sequences = model->Generate(*params); // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; - - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); + for (size_t i = 0; i < output_sequences->Count(); i++) { + auto out_string = tokenizer->Decode(output_sequences->Get(i)); std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); } #endif } @@ -179,44 +105,33 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 // And copy the resulting gpt2_init_past_fp32.onnx file into these two files (as it's the same for gpt2) - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32", &model)); - OgaModelPtr model_ptr{model}; + auto model = OgaModel::Create(MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); - 
OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", max_length)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", false)); - CheckResult(OgaGeneratorParamsSetInputIDs(params, input_ids.data(), input_ids.size(), sequence_length, batch_size)); + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", max_length); + params->SetInputIDs(input_ids.data(), input_ids.size(), sequence_length, batch_size); - OgaGenerator* generator; - CheckResult(OgaCreateGenerator(model, params, &generator)); - OgaGeneratorPtr generator_ptr{generator}; + auto generator = OgaGenerator::Create(*model, *params); - while (!OgaGenerator_IsDone(generator)) { - CheckResult(OgaGenerator_ComputeLogits(generator)); - CheckResult(OgaGenerator_GenerateNextToken(generator)); + while (!generator->IsDone()) { + generator->ComputeLogits(); + generator->GenerateNextToken(); } // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - size_t token_count = OgaGenerator_GetSequenceLength(generator, i); - const int32_t* data = OgaGenerator_GetSequence(generator, i); - std::vector sequence(data, data + token_count); + auto sequence = generator->GetSequence(i); auto* expected_output_start = &expected_output[i * max_length]; EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); } // Test high level API - OgaSequences* sequences; - CheckResult(OgaGenerate(model, params, &sequences)); - OgaSequencesPtr sequences_ptr{sequences}; + auto sequences = model->Generate(*params); // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - std::span sequence{OgaSequencesGetSequenceData(sequences, i), OgaSequencesGetSequenceCount(sequences, i)}; + auto sequence = sequences->Get(i); auto* expected_output_start = &expected_output[i * max_length]; EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); @@ -224,151 +139,92 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { } #if TEST_PHI2 -TEST(CAPITests, TopKCAPI) { - float top_k = 50; - float temp = 0.6f; - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; +struct Phi2Test { + Phi2Test() { + model_ = OgaModel::Create(MODEL_PATH "phi-2"); + tokenizer_ = OgaTokenizer::Create(*model_); - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; + input_sequences_ = OgaSequences::Create(); - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; + const char* input_strings[] = { + "This is a test.", + "Rats are awesome pets!", + "The quick brown fox jumps over the lazy dog.", + }; - const char* input_strings[] = { - "This is a test.", - "Rats are awesome pets!", - "The quick brown fox jumps over the lazy dog.", - }; + for (auto& string : input_strings) + tokenizer_->Encode(string, *input_sequences_); - for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); + params_ = OgaGeneratorParams::Create(*model_); + params_->SetInputSequences(*input_sequences_); + params_->SetSearchOption("max_length", 40); + } - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - 
CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); + void Run() { + // Low level loop + { + auto generator = OgaGenerator::Create(*model_, *params_); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; + while (!generator->IsDone()) { + generator->ComputeLogits(); + generator->GenerateNextToken(); + } - // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; + // Decode One at a time + for (size_t i = 0; i < 3; i++) { + auto out_string = tokenizer_->Decode(generator->GetSequence(i)); + std::cout << "Decoded string:" << out_string << std::endl; + } + } - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); - std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); - } -} + // High level + { + auto output_sequences = model_->Generate(*params_); -TEST(CAPITests, TopPCAPI) { - float top_p = 0.6f; - float temp = 0.6f; + // Decode The Batch + for (size_t i = 0; i < output_sequences->Count(); i++) { + auto out_string = tokenizer_->Decode(output_sequences->Get(i)); + std::cout << "Decoded string:" << out_string << std::endl; + } + } + } - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; + std::unique_ptr model_; + std::unique_ptr tokenizer_; + std::unique_ptr input_sequences_; + std::unique_ptr params_; +}; - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; +TEST(CAPITests, TopKCAPI) { + Phi2Test test; - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; + test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOption("top_k", 50); + test.params_->SetSearchOption("temperature", 0.6f); - const char* input_strings[] = { - "This is a test.", - "Rats are awesome pets!", - "The quick brown fox jumps over the lazy dog.", - }; + test.Run(); +} - for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); - - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; +TEST(CAPITests, TopPCAPI) { + Phi2Test test; - // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, 
i), OgaSequencesGetSequenceCount(output_sequences, i)}; + test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOption("top_p", 0.6f); + test.params_->SetSearchOption("temperature", 0.6f); - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); - std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); - } + test.Run(); } TEST(CAPITests, TopKTopPCAPI) { - float top_p = 0.6f; - int top_k = 50; - float temp = 0.6f; - - OgaModel* model; - CheckResult(OgaCreateModel(MODEL_PATH "phi-2", &model)); - OgaModelPtr model_ptr{model}; + Phi2Test test; - OgaTokenizer* tokenizer; - CheckResult(OgaCreateTokenizer(model, &tokenizer)); - OgaTokenizerPtr tokenizer_ptr{tokenizer}; + test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOption("top_k", 50); + test.params_->SetSearchOption("top_p", 0.6f); + test.params_->SetSearchOption("temperature", 0.6f); - OgaSequences* input_sequences; - CheckResult(OgaCreateSequences(&input_sequences)); - OgaSequencesPtr sequences_ptr{input_sequences}; - - const char* input_strings[] = { - "This is a test.", - "Rats are awesome pets!", - "The quick brown fox jumps over the lazy dog.", - }; - - for (auto& string : input_strings) - CheckResult(OgaTokenizerEncode(tokenizer, string, input_sequences)); - - OgaGeneratorParams* params; - CheckResult(OgaCreateGeneratorParams(model, ¶ms)); - OgaGeneratorParamsPtr params_ptr{params}; - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "max_length", 40)); - CheckResult(OgaGeneratorParamsSetSearchBool(params, "do_sample", true)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_k", top_k)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "top_p", top_p)); - CheckResult(OgaGeneratorParamsSetSearchNumber(params, "temperature", temp)); - CheckResult(OgaGeneratorParamsSetInputSequences(params, input_sequences)); - OgaSequences* output_sequences; - CheckResult(OgaGenerate(model, params, &output_sequences)); - OgaSequencesPtr output_sequences_ptr{output_sequences}; - - // Decode The Batch - for (size_t i = 0; i < OgaSequencesCount(output_sequences); i++) { - std::span sequence{OgaSequencesGetSequenceData(output_sequences, i), OgaSequencesGetSequenceCount(output_sequences, i)}; - - const char* out_string; - CheckResult(OgaTokenizerDecode(tokenizer, sequence.data(), sequence.size(), &out_string)); - std::cout << "Decoded string:" << out_string << std::endl; - OgaDestroyString(out_string); - } + test.Run(); } -#endif // TEST_PHI2 +#endif // TEST_PHI2 From 53df7dcb52092c2b73e20e1cc3a4089bc251139a Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:00:51 -0700 Subject: [PATCH 24/36] Support DML provider on Windows (#220) --- CMakeLists.txt | 8 ++++++++ build.py | 4 ++++ cmake/options.cmake | 1 + src/models/model.cpp | 14 +++++++++++++- 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 280a6148d..de12d6482 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,14 @@ else() list(REMOVE_ITEM generator_srcs ${generator_cuda_srcs}) endif() +if(USE_DML) + if(WIN32) + add_compile_definitions(USE_DML=1) + else() + message(FATAL_ERROR "USE_DML is ON but this isn't windows.") + endif() +endif() + if(ENABLE_TESTS AND TEST_PHI2) add_compile_definitions(TEST_PHI2=1) else() diff --git a/build.py b/build.py index 899415e3e..150ba7a54 100644 --- a/build.py +++ b/build.py @@ -96,6 
+96,7 @@ def validate_cuda_home(cuda_home: str | bytes | os.PathLike | None): def build( skip_wheel: bool = False, use_cuda: bool | None = None, + use_dml: bool | None = None, cuda_home: str | bytes | os.PathLike | None = None, cmake_generator: str | None = None, ort_home: str | bytes | os.PathLike | None = None, @@ -141,6 +142,7 @@ def build( "-DCMAKE_POSITION_INDEPENDENT_CODE=ON", "-DUSE_CXX17=ON", "-DUSE_CUDA=ON" if cuda_home else "-DUSE_CUDA=OFF", + "-DUSE_DML=ON" if use_dml else "-DUSE_DML=OFF", f"-DBUILD_WHEEL={build_wheel}", ] @@ -218,6 +220,7 @@ def build( parser.add_argument("--skip_csharp", action="store_true", help="Skip building the C# API.") parser.add_argument("--build_dir", default=None, help="Path to output directory.") parser.add_argument("--use_cuda", action="store_true", help="Whether to use CUDA. Default is to not use cuda.") + parser.add_argument("--use_dml", action="store_true", help="Whether to use DML. Default is to not use DML.") parser.add_argument("--parallel", action="store_true", help="Enable parallel build.") parser.add_argument( "--config", @@ -231,6 +234,7 @@ def build( build( skip_wheel=args.skip_wheel, use_cuda=args.use_cuda, + use_dml=args.use_dml, cuda_home=args.cuda_home, cmake_generator=args.cmake_generator, ort_home=args.ort_home, diff --git a/cmake/options.cmake b/cmake/options.cmake index d42ea1ce1..80f004215 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -1,6 +1,7 @@ include(CMakeDependentOption) option(USE_CUDA "Build with CUDA support" ON) +option(USE_DML "Build with DML support" OFF) option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." ON) option(ENABLE_TESTS "Enable tests" ON) diff --git a/src/models/model.cpp b/src/models/model.cpp index a31b1ed84..55ae62728 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -9,6 +9,11 @@ #include "decoder_only.h" #include "whisper.h" #include "kernels.h" +#ifdef USE_DML +// Because dml_provider_factory includes windows headers that #define min and max, this next line will prevent this from happening +#define NOMINMAX +#include "dml_provider_factory.h" +#endif namespace Generators { @@ -291,7 +296,14 @@ void Model::CreateSessionOptions() { Ort::ThrowOnError(Ort::api->UpdateROCMProviderOptions(&ort_provider_options, keys.data(), values.data(), keys.size())); ort_options.AppendExecutionProvider_ROCM(ort_provider_options); - device_type_ = DeviceType::CPU; // Scoring uses CPU, even though the model uses ROCM +#ifdef USE_DML + } else if (provider_options.name == "dml") { + const OrtDmlApi* p_dml_api{}; + Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&p_dml_api))); + if (!p_dml_api) + throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); + p_dml_api->SessionOptionsAppendExecutionProvider_DML(&ort_options, 0); +#endif } else throw std::runtime_error("Unknown provider type: " + provider_options.name); } From 1a13baefa53fc7899c0ca4ca26271610ee73d785 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 27 Mar 2024 12:52:18 -0400 Subject: [PATCH 25/36] Provide instruction on how to download and extract nightly ORT (#238) Provide instruction on how to download and extract nightly ORT --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index cc87d474c..b572a94fd 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,32 @@ Export int4 CPU version huggingface-cli login --token python -m onnxruntime_genai.models.builder -m 
microsoft/phi-2 -p int4 -e cpu -o ``` +## Getting the latest nightly Onnxruntime build +By default, onnxruntime-genai uses the latest stable release of onnxruntime. If you want to use the latest nightly build +of onnxruntime, you can download the nightly build of onnxruntime from our +[Azure DevOps Artifacts](https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/OnnxRuntime/). +nuget package can be uncompressed by renaming the extension to `.zip` and extracting the contents. +The onnxruntime dynamlic libraries and header files are available in the nightly build. You can extract the nuget package +and copy the dynamic libraries and header files to the `ort/` folder under onnxruntime-genai project root on the same level +as this `README.md` file. + +The library files are located in the `runtime/$OS-$Arch/native` folder and the header files are located in the +`build/native/include` folder in the nuget package. + +The final folder structure should look like this: +``` +onnxruntime-genai +│ README.md +│ ... +│ ort/ +│ │ include/ +│ │ │ coreml_provider_factory.h +│ │ │ ... +│ │ │ provider_options.h +│ │ lib/ +│ │ │ (lib)onnxruntime.(so|dylib|dll) +│ │ │ (lib)onnxruntime_providers_shared.(so|dylib|dll) +``` ## Contributing From f9e8e40e5d5453a7183db15e50231ba66e182f99 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 27 Mar 2024 13:15:17 -0400 Subject: [PATCH 26/36] Adding py-ado-release to pipeline (#233) --- .pipelines/nuget-publishing.yml | 6 - .pipelines/pypl-publishing.yml | 6 + .../stages/jobs/nuget-packaging-job.yml | 29 ++--- .../stages/jobs/py-linux-packaging-job.yml | 64 ----------- .pipelines/stages/jobs/py-packaging-job.yml | 105 ++++++++++++++++++ .../stages/jobs/py-win-packaging-job.yml | 71 ------------ .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../stages/jobs/steps/capi-win-step.yml | 2 +- ....yml => nuget-ado-feed-releasing-step.yml} | 20 ++-- .../jobs/steps/py-ado-feed-releasing-step.yml | 10 ++ .pipelines/stages/nuget-packaging-stage.yml | 8 +- .pipelines/stages/py-packaging-stage.yml | 22 +++- 12 files changed, 163 insertions(+), 182 deletions(-) delete mode 100644 .pipelines/stages/jobs/py-linux-packaging-job.yml create mode 100644 .pipelines/stages/jobs/py-packaging-job.yml delete mode 100644 .pipelines/stages/jobs/py-win-packaging-job.yml rename .pipelines/stages/jobs/steps/{nuget-releasing-step.yml => nuget-ado-feed-releasing-step.yml} (73%) create mode 100644 .pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index e91b57489..451411fd2 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -38,11 +38,6 @@ parameters: type: boolean default: false -- name: publish_to_nuget - displayName: 'Publish to NuGet.org' - type: boolean - default: false - resources: repositories: - repository: manylinux @@ -61,5 +56,4 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} ort_version: ${{ parameters.ort_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index d5cb45dca..edce0b37d 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -32,6 +32,11 @@ parameters: - '11.8' - '12.2' +- name: publish_to_ado_feed + displayName: 'Whether to publish the packages to ADO feed.' 
+ type: boolean + default: false + resources: repositories: - repository: manylinux @@ -50,3 +55,4 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} ort_version: ${{ parameters.ort_version }} cuda_version: ${{ parameters.cuda_version }} + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index af5250c3c..9bfd6454d 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -10,10 +10,12 @@ parameters: default: '' - name: os type: string + values: + - 'linux' + - 'win' - name: publish_to_ado_feed type: boolean -- name: publish_to_nuget - type: boolean + jobs: - job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging ${{ if eq(parameters.os, 'linux') }}: @@ -21,7 +23,7 @@ jobs: ${{ if eq(parameters.os, 'win') }}: pool: 'onnxruntime-Win-CPU-2022' timeoutInMinutes: 180 -# set variables here to be used in the template and steps + # set variables here to be used in the template and steps variables: - name: arch value: ${{ parameters.arch }} @@ -60,18 +62,19 @@ jobs: workspace: clean: all steps: - - template: steps/capi-${{ parameters.os }}-step.yml - parameters: - target: 'onnxruntime-genai' + - ${{ if eq(parameters.os, 'linux') }}: + - template: steps/capi-linux-step.yml + parameters: + target: 'onnxruntime-genai' + # TODO: Add a step to build the linux nuget package -# TODO: Add a step to build the linux nuget package - ${{ if eq(parameters.os, 'win') }}: - - template: steps/nuget-${{ parameters.os }}-step.yml - - ${{ if or(eq(parameters.publish_to_nuget, true), eq(parameters.publish_to_ado_feed, true))}}: - - template: steps/nuget-releasing-step.yml - parameters: - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} + - template: steps/capi-win-step.yml + parameters: + target: 'onnxruntime-genai' + - template: steps/nuget-win-step.yml + - ${{ if eq(parameters.publish_to_ado_feed, true)}}: + - template: steps/nuget-ado-feed-releasing-step.yml - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-linux-packaging-job.yml b/.pipelines/stages/jobs/py-linux-packaging-job.yml deleted file mode 100644 index b7c35d6a5..000000000 --- a/.pipelines/stages/jobs/py-linux-packaging-job.yml +++ /dev/null @@ -1,64 +0,0 @@ -parameters: -- name: arch - type: string -- name: ort_version - type: string -- name: ep - type: string -- name: cuda_version - type: string - default: '' -jobs: -- job: Linux_${{ parameters.ep }}_${{ parameters.arch }}_Wheels - strategy: - matrix: - Python38: - PyDotVer: '3.8' - PyNoDotVer: '38' - Python39: - PyDotVer: '3.9' - PyNoDotVer: '39' - Python310: - PyDotVer: '3.10' - PyNoDotVer: '310' - Python311: - PyDotVer: '3.11' - PyNoDotVer: '311' - Python312: - PyDotVer: '3.12' - PyNoDotVer: '312' - timeoutInMinutes: 240 - workspace: - clean: all - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' -# set variables here to be used in the template and steps - variables: - # The build machine pool doesn't have dotnet, so it can't run CG. 
- - name: skipComponentGovernanceDetection - value: true - - name: arch - value: ${{ parameters.arch }} - - name: ep - value: ${{ parameters.ep }} - - name: artifactName - value: 'onnxruntime-genai-capi-linux-${{ parameters.ep }}-${{ parameters.arch }}-python' - - name: cuda_version - value: ${{ parameters.cuda_version }} - - name: ort_version - value: ${{ parameters.ort_version }} - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-linux-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - steps: - - - template: steps/capi-linux-step.yml - parameters: - target: 'python' - - - template: steps/compliant-and-cleanup-step.yml - diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml new file mode 100644 index 000000000..7066b070e --- /dev/null +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -0,0 +1,105 @@ +parameters: +- name: arch + type: string +- name: ep + type: string +- name: ort_version + type: string +- name: cuda_version + type: string + default: '' +- name: os + type: string + values: + - 'linux' + - 'win' +- name: publish_to_ado_feed + type: boolean + +jobs: +- job: python_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging + ${{ if eq(parameters.os, 'linux') }}: + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' + ${{ if eq(parameters.os, 'win') }}: + pool: 'onnxruntime-Win-CPU-2022' + strategy: + matrix: + Python38: + PyDotVer: '3.8' + PyNoDotVer: '38' + Python39: + PyDotVer: '3.9' + PyNoDotVer: '39' + Python310: + PyDotVer: '3.10' + PyNoDotVer: '310' + Python311: + PyDotVer: '3.11' + PyNoDotVer: '311' + Python312: + PyDotVer: '3.12' + PyNoDotVer: '312' + timeoutInMinutes: 240 + workspace: + clean: all + # set variables here to be used in the template and steps + variables: + - name: skipComponentGovernanceDetection + ${{ if eq(parameters.os, 'linux') }}: + value: true + ${{ if eq(parameters.os, 'win') }}: + value: false + - name: arch + value: ${{ parameters.arch }} + - name: artifactName + value: 'onnxruntime-genai-${{ parameters.os }}-${{ parameters.ep }}-${{ parameters.arch }}' + - name: buildConfig + value: 'Release' + - name: buildDir + value: 'build/${{ parameters.ep }}' + - name: cuda_version + value: ${{ parameters.cuda_version }} + - name: ep + value: ${{ parameters.ep }} + - name: ort_version + value: ${{ parameters.ort_version }} + - name: ort_filename + ${{ if eq(parameters.ep, 'cpu') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + ${{ else}}: + ${{if eq(parameters.cuda_version, '11.8') }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: $(PyDotVer) + addToPath: true + architecture: $(arch) + - task: PythonScript@0 + inputs: + scriptSource: inline + script: | + import sys + import subprocess + subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', 'build', 'packaging', 'twine']) + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Install python modules' + + - ${{ if eq(parameters.os, 'linux') }}: + - template: 
steps/capi-linux-step.yml + parameters: + target: 'python' + + # Windows job needs to set the python version and install the required packages + - ${{ if eq(parameters.os, 'win') }}: + - template: steps/capi-win-step.yml + parameters: + target: 'python' + + - ${{ if eq(parameters.publish_to_ado_feed, true)}}: + - template: steps/py-ado-feed-releasing-step.yml + + - template: steps/compliant-and-cleanup-step.yml + diff --git a/.pipelines/stages/jobs/py-win-packaging-job.yml b/.pipelines/stages/jobs/py-win-packaging-job.yml deleted file mode 100644 index 0989398eb..000000000 --- a/.pipelines/stages/jobs/py-win-packaging-job.yml +++ /dev/null @@ -1,71 +0,0 @@ -parameters: -- name: arch - type: string -- name: ort_version - type: string -- name: cuda_version - type: string - default: '' -- name: ep - type: string -jobs: -- job: Windows_${{ parameters.ep }}_${{ parameters.arch }}_Wheels - pool: 'onnxruntime-Win-CPU-2022' - strategy: - matrix: - Python38_x64: - PythonVersion: '3.8' - Python39_x64: - PythonVersion: '3.9' - Python310_x64: - PythonVersion: '3.10' - Python311_x64: - PythonVersion: '3.11' - Python312_x64: - PythonVersion: '3.12' - timeoutInMinutes: 180 -# set variables here to be used in the template and steps - variables: - - name: ep - value: ${{ parameters.ep }} - - name: cuda_version - value: ${{ parameters.cuda_version }} - - name: artifactName - value: 'onnxruntime-genai-capi-win-${{ parameters.ep }}-${{ parameters.arch }}-wheel' - - name: arch - value: ${{ parameters.arch }} - - name: ort_version - value: ${{ parameters.ort_version }} - - name: ort_filename - ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-win-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' - ${{ else }}: - value: 'onnxruntime-win-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' - workspace: - clean: all - steps: - - - task: UsePythonVersion@0 - inputs: - versionSpec: $(PythonVersion) - addToPath: true - architecture: $(arch) - - - task: PythonScript@0 - inputs: - scriptSource: inline - script: | - import sys - import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', 'build', 'packaging']) - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Install python modules' - - - template: steps/capi-win-step.yml - parameters: - target: 'python' - - - template: steps/compliant-and-cleanup-step.yml \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 6fa0f3c92..03f76feb1 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -93,7 +93,7 @@ steps: - task: BinSkim@4 displayName: 'Run BinSkim' inputs: - AnalyzeTargetGlob: '$(Build.Repository.LocalPath)/**/*.pyd' + AnalyzeTargetGlob: '$(Build.Repository.LocalPath)/build/**/*cpython*.so' continueOnError: true - bash: | diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index aebc4cd13..3681bffd4 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -59,7 +59,7 @@ steps: - ${{ if eq(parameters.target, 'onnxruntime-genai') }}: - template: compliant/win-esrp-dll-step.yml parameters: - FolderPath: '$(buildDir)' + FolderPath: '$(Build.Repository.LocalPath)\$(buildDir)' DisplayName: 'ESRP - Sign 
C++ dlls' Pattern: '*genai.dll' diff --git a/.pipelines/stages/jobs/steps/nuget-releasing-step.yml b/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml similarity index 73% rename from .pipelines/stages/jobs/steps/nuget-releasing-step.yml rename to .pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml index 8442fd069..331a9ea7c 100644 --- a/.pipelines/stages/jobs/steps/nuget-releasing-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml @@ -1,8 +1,3 @@ -parameters: -- name: publish_to_ado_feed - type: boolean -- name: publish_to_nuget - type: boolean steps: - task: NuGetToolInstaller@1 inputs: @@ -39,11 +34,10 @@ steps: parameters: packageFolder: '$(GDN_CODESIGN_TARGETDIRECTORY)' #This task must be run on a Windows machine -- ${{ if eq(parameters.publish_to_ado_feed, true) }}: - - task: NuGetCommand@2 - displayName: 'NuGet push to Azure DevOps Feed' - inputs: - command: push - packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' - publishVstsFeed: 'PublicPackages/onnxruntime-genai' - allowPackageConflicts: true \ No newline at end of file +- task: NuGetCommand@2 + displayName: 'NuGet push to Azure DevOps Feed' + inputs: + command: push + packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' + publishVstsFeed: 'PublicPackages/onnxruntime-genai' + allowPackageConflicts: true \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml b/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml new file mode 100644 index 000000000..85c0a7e3d --- /dev/null +++ b/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml @@ -0,0 +1,10 @@ +steps: +- task: TwineAuthenticate@1 + inputs: + artifactFeed: PublicPackages/onnxruntime-genai +- script: 'python -m twine upload -r onnxruntime-genai --config-file $(PYPIRC_PATH) --non-interactive *.whl' + workingDirectory: '$(Build.ArtifactStagingDirectory)/wheel' + displayName: 'Uploading wheels to PublicPackages/onnxruntime-genai' + retryCountOnTaskFailure: 3 + env: + SYSTEM_ACCESSTOKEN: $(System.AccessToken) \ No newline at end of file diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index db500916b..f962337ac 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -14,8 +14,6 @@ parameters: default: '' - name: publish_to_ado_feed type: boolean -- name: publish_to_nuget - type: boolean stages: - stage: nuget_packaging @@ -28,7 +26,6 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -38,7 +35,6 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -47,7 +43,6 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - publish_to_nuget: ${{ parameters.publish_to_nuget }} - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -56,5 +51,4 @@ stages: ep: 'cuda' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - 
publish_to_nuget: ${{ parameters.publish_to_nuget }} \ No newline at end of file + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index 62181490a..e23581f56 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -12,37 +12,47 @@ parameters: - name: cuda_version type: string default: '' +- name: publish_to_ado_feed + type: boolean + stages: -- stage: Python_Packaging_Stage +- stage: python_packaging jobs: - ${{ if eq(parameters.enable_win_cpu, true) }}: - - template: jobs/py-win-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - ${{ if eq(parameters.enable_win_cuda, true) }}: - - template: jobs/py-win-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' ort_version: ${{ parameters.ort_version }} - + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - ${{ if eq(parameters.enable_linux_cpu, true) }}: - - template: jobs/py-linux-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' ep: 'cpu' ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - ${{ if eq(parameters.enable_linux_cuda, true) }}: - - template: jobs/py-linux-packaging-job.yml + - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' ort_version: ${{ parameters.ort_version }} + os: 'linux' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} From d656be953c66c3d1fba17a546b12a30348558323 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 27 Mar 2024 13:30:15 -0400 Subject: [PATCH 27/36] Mergin rel-0.1.0 back to main (#231) --- .pipelines/stages/jobs/steps/nuget-win-step.yml | 2 +- VERSION_INFO | 2 +- examples/csharp/README.md | 8 ++++++++ examples/python/README.md | 4 ++++ test/csharp/TestOnnxRuntimeGenAIAPI.cs | 6 +++--- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index 3191412a3..11b134caa 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -16,7 +16,7 @@ steps: DisplayName: 'ESRP - Sign C# dlls' Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | - $VERSION = '0.1.0-rc1' + $VERSION = '0.1.0-rc4' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` -Prop version=$VERSION ` -Prop genai_nuget_ext=$(genai_nuget_ext) ` diff --git a/VERSION_INFO b/VERSION_INFO index 49ffebcaa..3e2177af6 100644 --- a/VERSION_INFO +++ b/VERSION_INFO @@ -1 +1 @@ -0.1.0-dev \ No newline at end of file +0.1.0rc4 \ No newline at end of file diff --git a/examples/csharp/README.md b/examples/csharp/README.md index edb71a717..7052a02d4 100644 --- a/examples/csharp/README.md +++ b/examples/csharp/README.md @@ -1,5 +1,13 @@ # Gen-AI C# Phi-2 Example +## Install the onnxruntime-genai library + +* Install the python package + + ```bash + pip install onnxruntime-genai + ``` + ## Get the model You can generate the model using the model builder provided with this library, or bring your own model. 
diff --git a/examples/python/README.md b/examples/python/README.md index cf7fe3450..6d1490de2 100644 --- a/examples/python/README.md +++ b/examples/python/README.md @@ -4,6 +4,10 @@ Install the python package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). + ```bash + cd build/wheel + pip install onnxruntime_genai-*.whl + ``` ## Get the model diff --git a/test/csharp/TestOnnxRuntimeGenAIAPI.cs b/test/csharp/TestOnnxRuntimeGenAIAPI.cs index 156f943b4..2113ffdca 100644 --- a/test/csharp/TestOnnxRuntimeGenAIAPI.cs +++ b/test/csharp/TestOnnxRuntimeGenAIAPI.cs @@ -93,7 +93,7 @@ public void TestTopKSearch() int topK = 100; float temp = 0.6f; ulong maxLength = 20; - + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) { @@ -135,7 +135,7 @@ public void TestTopPSearch() float topP = 0.6f; float temp = 0.6f; ulong maxLength = 20; - + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) { @@ -178,7 +178,7 @@ public void TestTopKTopPSearch() float topP = 0.6f; float temp = 0.6f; ulong maxLength = 20; - + string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_models", "cpu", "phi-2"); using (var model = new Model(modelPath)) { From 7aef327a36bc318f126c6d2703367b60595dd27b Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:58:26 -0700 Subject: [PATCH 28/36] DML - Preload DirectML.dll to not use OS version (#241) If we fail to preload, we abort using DirectML as using the OS copy will fail when we try to use it. Also Copy DirectML.dll to the install folder as part of the other onnxruntime files --- CMakeLists.txt | 3 +++ src/models/model.cpp | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index de12d6482..ff70bea11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -183,6 +183,9 @@ endif() # Copy the onnxruntime binaries into the build folder so it's found on launch file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") +if(USE_DML) + list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") +endif() foreach(DLL_FILE ${onnxruntime_libs}) add_custom_command( TARGET onnxruntime-genai POST_BUILD diff --git a/src/models/model.cpp b/src/models/model.cpp index 55ae62728..5e4c519ee 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -13,6 +13,23 @@ // Because dml_provider_factory includes windows headers that #define min and max, this next line will prevent this from happening #define NOMINMAX #include "dml_provider_factory.h" + +EXTERN_C IMAGE_DOS_HEADER __ImageBase; + +static std::wstring CurrentModulePath() { + wchar_t path[MAX_PATH]; + GetModuleFileNameW((HINSTANCE)&__ImageBase, path, _countof(path)); + + wchar_t absolute_path[MAX_PATH]; + wchar_t* name; + GetFullPathNameW(path, _countof(path), absolute_path, &name); + + auto idx = std::distance(absolute_path, name); + auto out_path = std::wstring(absolute_path); + out_path.resize(idx); + + return out_path; +} #endif namespace Generators { @@ -302,6 +319,9 @@ void Model::CreateSessionOptions() { Ort::ThrowOnError(Ort::api->GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast(&p_dml_api))); if (!p_dml_api) throw std::runtime_error("Unexpected nullptr getting OrtDmlApi"); + auto directml_dll = CurrentModulePath() + L"DirectML.dll"; + if 
(LoadLibraryExW(directml_dll.c_str(), nullptr, 0) == NULL) + throw std::runtime_error("DirectML.dll not found"); p_dml_api->SessionOptionsAppendExecutionProvider_DML(&ort_options, 0); #endif } else From a11c9a74109996a2bb58ef471dc14fdceddac61d Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 1 Apr 2024 13:11:25 -0700 Subject: [PATCH 29/36] Fix calculating rotary embedding dim (#244) ### Description This PR fixes how `rotary_embedding_dim` is calculated. ### Motivation and Context This PR fixes [this issue](https://github.com/microsoft/onnxruntime-genai/issues/237). --- src/python/py/models/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index fa022f7aa..740c7ca40 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -1471,7 +1471,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor self.layernorm_attrs["simple"] = False self.rotemb_attrs["num_heads"] = self.num_attn_heads - self.rotemb_attrs["rotary_embedding_dim"] = self.num_attn_heads + self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) From 7869d91d5a2c0c5d5c04770fb11084247237b640 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Mon, 1 Apr 2024 14:19:39 -0700 Subject: [PATCH 30/36] Add fp32 test to nightly run (#242) --- test/python/test_onnxruntime_genai_e2e.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/python/test_onnxruntime_genai_e2e.py b/test/python/test_onnxruntime_genai_e2e.py index cc6f9dde2..f76354261 100644 --- a/test/python/test_onnxruntime_genai_e2e.py +++ b/test/python/test_onnxruntime_genai_e2e.py @@ -10,7 +10,7 @@ def download_model( - download_path: str | bytes | os.PathLike, device: str, model_identifier: str + download_path: str | bytes | os.PathLike, device: str, model_identifier: str, precision: str ): # python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -p int4 -e cpu -o download_path command = [ @@ -20,7 +20,7 @@ def download_model( "-m", model_identifier, "-p", - "int4", + precision, "-e", device, "-o", @@ -51,7 +51,9 @@ def run_model(model_path: str | bytes | os.PathLike): if __name__ == "__main__": for model_name in ["microsoft/phi-2"]: - with tempfile.TemporaryDirectory() as temp_dir: - device = "cuda" if og.is_cuda_available() else "cpu" - download_model(temp_dir, device, model_name) - run_model(temp_dir) + for precision in ["int4", "fp32"]: + with tempfile.TemporaryDirectory() as temp_dir: + device = "cuda" if og.is_cuda_available() else "cpu" + download_model(temp_dir, device, model_name, precision) + run_model(temp_dir) + From 6ad63e199b528a3d075d3a0b2e1dc91f8ff8a6e1 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 2 Apr 2024 00:36:52 -0700 Subject: [PATCH 31/36] Make position_ids be an optional input (#246) --- src/models/model.cpp | 8 +++++ src/models/model.h | 3 ++ src/models/position_ids.cpp | 58 ++++++++++++++++++++----------------- 
src/models/position_ids.h | 1 + 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/src/models/model.cpp b/src/models/model.cpp index 5e4c519ee..6ab493f19 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -210,6 +210,14 @@ SessionInfo::SessionInfo(OrtSession& session) { } } +bool SessionInfo::HasInput(const std::string& name) const { + return inputs_.find(name) != inputs_.end(); +} + +bool SessionInfo::HasOutput(const std::string& name) const { + return outputs_.find(name) != outputs_.end(); +} + ONNXTensorElementDataType SessionInfo::GetInputDataType(const std::string& name) const { auto result = inputs_.find(name); if (result == inputs_.end()) diff --git a/src/models/model.h b/src/models/model.h index 9af784362..a4b70ae46 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -88,6 +88,9 @@ struct Tokenizer : std::enable_shared_from_this { struct SessionInfo { SessionInfo(OrtSession& session); + bool HasInput(const std::string& name) const; + bool HasOutput(const std::string& name) const; + ONNXTensorElementDataType GetInputDataType(const std::string& name) const; ONNXTensorElementDataType GetOutputDataType(const std::string& name) const; diff --git a/src/models/position_ids.cpp b/src/models/position_ids.cpp index a0e8d6b56..ec6ebd579 100644 --- a/src/models/position_ids.cpp +++ b/src/models/position_ids.cpp @@ -8,7 +8,9 @@ namespace Generators { PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray& sequence_lengths_unk) : model_{model}, state_{state} { - type_ = model_.session_info_->GetInputDataType(model_.config_->model.decoder.inputs.position_ids); + has_position_ids_ = model_.session_info_->HasInput(model_.config_->model.decoder.inputs.position_ids); + type_ = model_.session_info_->GetInputDataType(model_.config_->model.decoder.inputs.attention_mask); + if (type_ != Ort::TypeToTensorType::type && type_ != Ort::TypeToTensorType::type) throw std::runtime_error("position_ids & attention_mask only support int32 or int64 types"); @@ -33,38 +35,42 @@ PositionIDs::PositionIDs(const Model& model, State& state, RoamingArray void PositionIDs::Add() { input_index_ = state_.inputs_.size(); - state_.inputs_.push_back(position_ids_.get()); - state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); + if (has_position_ids_) { + state_.inputs_.push_back(position_ids_.get()); + state_.input_names_.push_back(model_.config_->model.decoder.inputs.position_ids.c_str()); + } state_.inputs_.push_back(attention_mask_.get()); state_.input_names_.push_back(model_.config_->model.decoder.inputs.attention_mask.c_str()); } void PositionIDs::Update(int current_length) { - // Reallocate position_ids for the 2nd and onward shape - if (position_ids_next_) { - position_ids_ = std::move(position_ids_next_); - position_ids_shape_[1] = 1; - state_.inputs_[input_index_] = position_ids_.get(); - } else { // Just incrementing existing position IDs - switch (model_.device_type_) { - case DeviceType::CPU: { - if (type_ == Ort::TypeToTensorType::type) - UpdatePositionIDs(); - else - UpdatePositionIDs(); - break; - } + if (has_position_ids_) { + // Reallocate position_ids for the 2nd and onward shape + if (position_ids_next_) { + position_ids_ = std::move(position_ids_next_); + position_ids_shape_[1] = 1; + state_.inputs_[input_index_] = position_ids_.get(); + } else { // Just incrementing existing position IDs + switch (model_.device_type_) { + case DeviceType::CPU: { + if (type_ == Ort::TypeToTensorType::type) + UpdatePositionIDs(); + else + 
UpdatePositionIDs(); + break; + } #if USE_CUDA - case DeviceType::CUDA: - if (type_ == Ort::TypeToTensorType::type) - cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); - else - cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); - break; + case DeviceType::CUDA: + if (type_ == Ort::TypeToTensorType::type) + cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); + else + cuda::Launch_UpdatePositionIds(position_ids_->GetTensorMutableData(), static_cast(position_ids_shape_[0]), model_.cuda_stream_); + break; #endif - default: - throw std::runtime_error("PositionIDs::Update - Unsupported device type"); + default: + throw std::runtime_error("PositionIDs::Update - Unsupported device type"); + } } } @@ -95,7 +101,7 @@ void PositionIDs::Update(int current_length) { throw std::runtime_error("PositionIDs::Update - Unsupported device type"); } attention_mask_ = std::move(next_attention_mask); - state_.inputs_[input_index_ + 1] = attention_mask_.get(); + state_.inputs_[input_index_ + has_position_ids_] = attention_mask_.get(); } } diff --git a/src/models/position_ids.h b/src/models/position_ids.h index 411b55c1c..22601f359 100644 --- a/src/models/position_ids.h +++ b/src/models/position_ids.h @@ -21,6 +21,7 @@ struct PositionIDs { State& state_; size_t input_index_{~0U}; ONNXTensorElementDataType type_; // Common type for position_ids and attention_mask + bool has_position_ids_; std::array position_ids_shape_{}; // {params.batch_size*params.beam_size, params.sequence_length} std::unique_ptr position_ids_; From 935ed35651aca79ff628da4a53b12d77be2af269 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Wed, 3 Apr 2024 12:15:25 -0700 Subject: [PATCH 32/36] refine the c example (#248) --- examples/c/CMakeLists.txt | 31 ++++++++++++++++++++++++------- examples/c/README.md | 2 +- examples/c/src/main.cpp | 24 +++++++++++++++++------- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index d44909286..9b33a3ed3 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -4,13 +4,24 @@ project(phi2) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++2a") +set(ORT_GENAI_LIB_DIR ${CMAKE_SOURCE_DIR}/lib) + +if(WIN32) + set(ONNXRUNTIME_GENAI_LIB "onnxruntime-genai.dll") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.dll") +elseif(APPLE) + set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.dylib") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.dylib") +else() + set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.so") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.so") +endif() + add_executable(phi2 ${CMAKE_SOURCE_DIR}/src/main.cpp) -add_library(onnxruntime-genai SHARED IMPORTED) -set_target_properties(onnxruntime-genai PROPERTIES - IMPORTED_LOCATION_RELEASE ${CMAKE_SOURCE_DIR}/lib/onnxruntime-genai.dll - IMPORTED_IMPLIB_RELEASE ${CMAKE_SOURCE_DIR}/lib/onnxruntime-genai.lib -) + +target_link_directories(phi2 PRIVATE ${ORT_GENAI_LIB_DIR}) +target_link_libraries(phi2 PRIVATE ${ONNXRUNTIME_GENAI_LIB}) target_include_directories(phi2 PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries( @@ -18,5 +29,11 @@ target_link_libraries( PUBLIC onnxruntime-genai) -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/phi-2" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/Release") -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/lib/" DESTINATION 
"${CMAKE_CURRENT_BINARY_DIR}/Release") +file(GLOB ort_genai_libs "${CMAKE_SOURCE_DIR}/lib/${ONNXRUNTIME_GENAI_DEPENDENCY}") + +foreach(DLL_FILE ${ort_genai_libs}) + add_custom_command( + TARGET phi2 POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_FILE} $ + ) +endforeach() \ No newline at end of file diff --git a/examples/c/README.md b/examples/c/README.md index 8cd2168fd..0a45578cd 100644 --- a/examples/c/README.md +++ b/examples/c/README.md @@ -48,5 +48,5 @@ cmake --build . --config Release ```bash cd build\\Release -.\phi2.exe +.\phi2.exe path_to_model ``` diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index d9aeb68a8..e4be639f2 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -4,8 +4,8 @@ // C++ API Example -void CXX_API() { - auto model = OgaModel::Create("phi-2"); +void CXX_API(const char* model_path) { + auto model = OgaModel::Create(model_path); auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; @@ -34,9 +34,9 @@ void CheckResult(OgaResult* result) { } } -void C_API() { +void C_API(const char* model_path) { OgaModel* model; - OgaCreateModel("phi-2", &model); + OgaCreateModel(model_path, &model); OgaTokenizer* tokenizer; CheckResult(OgaCreateTokenizer(model, &tokenizer)); @@ -74,16 +74,26 @@ void C_API() { OgaDestroyModel(model); } -int main() { +static void print_usage(int /*argc*/, char** argv) { + std::cerr << "usage: " << argv[0] << " model_path" << std::endl; +} + +int main(int argc, char** argv) { + if (argc != 2) { + print_usage(argc, argv); + return -1; + } + + std::cout << "-------------" << std::endl; std::cout << "Hello, Phi-2!" << std::endl; std::cout << "-------------" << std::endl; std::cout << "C++ API" << std::endl; - CXX_API(); + CXX_API(argv[1]); std::cout << "C API" << std::endl; - C_API(); + C_API(argv[1]); return 0; } \ No newline at end of file From 75f87021b445247b650ef616e8f3d8211f26b64c Mon Sep 17 00:00:00 2001 From: rui-ren Date: Wed, 3 Apr 2024 13:14:21 -0700 Subject: [PATCH 33/36] update README doc (#247) 1. Update README.md - Add the `model builder` step to `README.md` - Use `abspath` for `og.Model`. - Update to `GeneratorParams` 2. Add `pandas` as dependent packages for benchmark. Co-authored-by: Ubuntu --- README.md | 13 +++++++++++-- benchmark/python/README | 4 ++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b572a94fd..0e8b42566 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,19 @@ See full documentation at [https://onnxruntime.ai/docs/genai]. [Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package. +1. Build the model +```shell +python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./models/phi2 +``` + +2. Run inference ```python +import os import onnxruntime_genai as og -model = og.Model(f'models/microsoft/phi-2') +model_path = os.path.abspath("./models/phi2") + +model = og.Model(model_path) tokenizer = og.Tokenizer(model) @@ -64,7 +73,7 @@ prompt = '''def print_prime(n): tokens = tokenizer.encode(prompt) -params = og.SearchParams(model) +params = og.GeneratorParams(model) params.set_search_options({"max_length":200}) params.input_ids = tokens diff --git a/benchmark/python/README b/benchmark/python/README index da1174309..67cac3ccb 100644 --- a/benchmark/python/README +++ b/benchmark/python/README @@ -2,7 +2,7 @@ This is an end-to-end benchmarking script for any GenAI-supported ONNX model. 
Prerequisites: -0) Install onnxruntime-genai and onnxruntime +0) Install pandas, onnxruntime-genai and onnxruntime 1) Use builder.py to build the desired ONNX model @@ -10,4 +10,4 @@ Prerequisites: Example call to benchmarking script -python benchmark_e2e.py -i {model folder} -b 1 -l 128 -g 256 -r 100 -w 10 -k 5 -o {output csv file name} \ No newline at end of file +python benchmark_e2e.py -i {model folder} -b 1 -l 128 -g 256 -r 100 -w 10 -k 5 -o {output csv file name} From a2789fb84a381b202790a905527e1aaae716524a Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:25:51 -0700 Subject: [PATCH 34/36] Add a random_seed search option (#250) It goes into the search options as `"random_seed":1234` (for example). The default value is -1, which means to use a random seed. There is an issue with CUDA where the output can eventually differ even if the same seed is used. I did some investigation and the random numbers match, but the tokens chosen will differ eventually in longer outputs. With CPU the output always matches. --- src/config.cpp | 2 ++ src/config.h | 1 + src/cuda_sampling.cu | 42 +++++++++++++++++++++--------------------- src/cuda_sampling.cuh | 20 +++++++++++--------- src/search.cpp | 12 +++++++++++- src/search.h | 1 - src/search_cuda.cpp | 8 +++++++- 7 files changed, 53 insertions(+), 33 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 74045a524..24d789bdd 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -285,6 +285,8 @@ struct Search_Element : JSON::Element { v_.diversity_penalty = static_cast(value); } else if (name == "length_penalty") { v_.length_penalty = static_cast(value); + } else if (name == "random_seed") { + v_.random_seed = static_cast(value); } else throw JSON::unknown_value_error{}; } diff --git a/src/config.h b/src/config.h index 2621edc21..b5eb67dcc 100644 --- a/src/config.h +++ b/src/config.h @@ -86,6 +86,7 @@ struct Config { float diversity_penalty{}; float length_penalty{1.0f}; // Exponential penalty to the length that is used with beam-based generation. length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences. 
bool past_present_share_buffer{}; // The past/present kv tensors are shared and allocated once to max_length (cuda only) + int random_seed{-1}; // -1 = Seed with random device, otherwise use value to seed RNG } search; }; diff --git a/src/cuda_sampling.cu b/src/cuda_sampling.cu index 471d593ae..bef166d9f 100644 --- a/src/cuda_sampling.cu +++ b/src/cuda_sampling.cu @@ -11,7 +11,6 @@ #include "smartptrs.h" #include #include -#include #include namespace Generators { @@ -20,7 +19,15 @@ namespace cuda { constexpr int kMaxThreads = 1024; constexpr int kGPUWarpSize = 32; -SamplingData::SamplingData(int batch_size, int vocab_size, cudaStream_t stream) { +__global__ void InitCurandStates(unsigned long long seed, curandState* states, int batch_size) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= batch_size) + return; + + curand_init(seed, index, 0, &states[index]); +} + +SamplingData::SamplingData(unsigned long long random_seed, int batch_size, int vocab_size, cudaStream_t stream) { indices_sorted = CudaMallocArray(vocab_size * batch_size); scores_sorted = CudaMallocArray(vocab_size * batch_size); scores_softmaxed = CudaMallocArray(vocab_size * batch_size); @@ -28,10 +35,13 @@ SamplingData::SamplingData(int batch_size, int vocab_size, cudaStream_t stream) thresholds = CudaMallocArray(batch_size); indices_in = CudaMallocArray(vocab_size * batch_size); offsets = CudaMallocArray(batch_size + 1); + curand_states = CudaMallocArray(batch_size); temp_storage_bytes = 0; cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, (float*)nullptr, (float*)nullptr, (int*)nullptr, (int*)nullptr, vocab_size*batch_size, batch_size, (int*)nullptr, (int*)nullptr, 0, sizeof(float) * 8, stream); temp_buffer = CudaMallocArray(temp_storage_bytes / sizeof(float)); + + InitCurandStates<<>>(random_seed, curand_states.get(), batch_size); } // Softmax Kernels and Launchers @@ -431,37 +441,31 @@ void LaunchGetTopKSubset(cudaStream_t stream, float* scores_in, float* scores_ou } // Sets up random thresholds for top p or top k sampling -__global__ void RandomThresholdKernelTopPAndK(int seed, float* thresholds, float* prefix_sums, int batch_size, float p, int k) { +__global__ void RandomThresholdKernelTopPAndK(curandState* curand_states, float* thresholds, float* prefix_sums, int batch_size, float p, int k) { int index = threadIdx.x + blockIdx.x * blockDim.x; - curandState state; - curand_init(seed, index, 0, &state); float k_prob = prefix_sums[k-1]; if (index < batch_size) { float min_p = fminf(p, k_prob); - thresholds[index] = min_p * curand_uniform(&state); + thresholds[index] = min_p * curand_uniform(&curand_states[index]); } } // Sets up random thresholds for top p or top k sampling -__global__ void RandomThresholdKernelTopP(int seed, float* thresholds, float* prefix_sums, int batch_size, float p) { +__global__ void RandomThresholdKernelTopP(curandState* curand_states, float* thresholds, float* prefix_sums, int batch_size, float p) { int index = threadIdx.x + blockIdx.x * blockDim.x; - curandState state; - curand_init(seed, index, 0, &state); if (index < batch_size) { - thresholds[index] = p * curand_uniform(&state); + thresholds[index] = p * curand_uniform(&curand_states[index]); } } // Sets up random thresholds for top p or top k sampling -__global__ void RandomThresholdKernelTopK(int seed, float* thresholds, float* prefix_sums, int batch_size, int k) { +__global__ void RandomThresholdKernelTopK(curandState* curand_states, float* thresholds, float* prefix_sums, int batch_size, 
int k) { int index = threadIdx.x + blockIdx.x * blockDim.x; - curandState state; - curand_init(seed, index, 0, &state); if (index < batch_size) { - thresholds[index] = prefix_sums[k-1] * curand_uniform(&state); + thresholds[index] = prefix_sums[k - 1] * curand_uniform(&curand_states[index]); } } @@ -502,16 +506,12 @@ void LaunchSampleKernel(SamplingData* data, cudaStream_t stream, float* scores, PrefixSumKernel<256><<>>(scores, prefix_sums.data(), sample_range, batch_size); // Random Thresholds for Top P or Top K Sampling std::span thresholds{data->thresholds.get(), static_cast(batch_size)}; - std::random_device rd; - std::mt19937 eee(rd()); - std::uniform_int_distribution dist(0, std::numeric_limits::max()); - int seed = dist(eee); if (p > 0.0 && k > 1) { - RandomThresholdKernelTopPAndK<<>>(seed, thresholds.data(), prefix_sums.data(), batch_size, p, k); + RandomThresholdKernelTopPAndK<<>>(data->curand_states.get(), thresholds.data(), prefix_sums.data(), batch_size, p, k); } else if (p > 0.0) { - RandomThresholdKernelTopP<<>>(seed, thresholds.data(), prefix_sums.data(), batch_size, p); + RandomThresholdKernelTopP<<>>(data->curand_states.get(), thresholds.data(), prefix_sums.data(), batch_size, p); } else if (k > 1) { - RandomThresholdKernelTopK<<>>(seed, thresholds.data(), prefix_sums.data(), batch_size, k); + RandomThresholdKernelTopK<<>>(data->curand_states.get(), thresholds.data(), prefix_sums.data(), batch_size, k); } SampleKernel<256><<>>(prefix_sums.data(), indices, index_out, sample_range, thresholds.data()); } diff --git a/src/cuda_sampling.cuh b/src/cuda_sampling.cuh index f7f74b827..cc8ab9867 100644 --- a/src/cuda_sampling.cuh +++ b/src/cuda_sampling.cuh @@ -1,20 +1,22 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
#include "smartptrs.h" +#include namespace Generators { namespace cuda { struct SamplingData { - SamplingData(int batch_size, int vocab_size, cudaStream_t stream); - std::unique_ptr indices_sorted = nullptr; - std::unique_ptr scores_sorted = nullptr; - std::unique_ptr scores_softmaxed = nullptr; - std::unique_ptr prefix_sums = nullptr; - std::unique_ptr thresholds = nullptr; - std::unique_ptr indices_in = nullptr; - std::unique_ptr offsets = nullptr; - std::unique_ptr temp_buffer = nullptr; + SamplingData(unsigned long long random_seed, int batch_size, int vocab_size, cudaStream_t stream); + cuda_unique_ptr indices_sorted; + cuda_unique_ptr scores_sorted; + cuda_unique_ptr scores_softmaxed; + cuda_unique_ptr prefix_sums; + cuda_unique_ptr thresholds; + cuda_unique_ptr indices_in; + cuda_unique_ptr offsets; + cuda_unique_ptr temp_buffer; + cuda_unique_ptr curand_states; size_t temp_storage_bytes = 0; }; diff --git a/src/search.cpp b/src/search.cpp index dd3389270..aeff79c1d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -15,7 +15,17 @@ Search_Cpu::Search_Cpu(const GeneratorParams& params) } GreedySearch_Cpu::GreedySearch_Cpu(const GeneratorParams& params) - : Search_Cpu(params), gen_(rd_()) { + : Search_Cpu(params) { + if (params_->search.random_seed != -1) + gen_.seed(params_->search.random_seed); + else { + std::random_device rd; + std::array data; + std::generate(std::begin(data), std::end(data), std::ref(rd)); + std::seed_seq seq(data.begin(), data.end()); + gen_.seed(seq); + } + next_tokens_buffer_ = AllocateArray(params.batch_size, &next_tokens_); memset(next_tokens_.data(), 0, next_tokens_.size_bytes()); diff --git a/src/search.h b/src/search.h index 5a52c11e2..70dab187b 100644 --- a/src/search.h +++ b/src/search.h @@ -83,7 +83,6 @@ struct GreedySearch_Cpu : Search_Cpu { std::unique_ptr eos_seen_buffer_; int not_done_count_{params_->batch_size}; // When zero, every batch entry is done (starts at batch_size_) - std::random_device rd_; std::mt19937 gen_; }; diff --git a/src/search_cuda.cpp b/src/search_cuda.cpp index aa6d85431..9a5c6e9de 100644 --- a/src/search_cuda.cpp +++ b/src/search_cuda.cpp @@ -33,7 +33,13 @@ GreedySearch_Cuda::GreedySearch_Cuda(const GeneratorParams& params) : Search_Cuda{params} { next_tokens_buffer_ = CudaMallocArray(params.batch_size, &next_tokens_); cudaMemsetAsync(next_tokens_.data(), 0, next_tokens_.size_bytes(), params_->cuda_stream); - samplingdata_ = std::make_unique(params_->batch_size, params_->vocab_size, params_->cuda_stream); + + unsigned long long random_seed; + if (params_->search.random_seed != -1) + random_seed = params_->search.random_seed; + else + random_seed = std::random_device{}(); + samplingdata_ = std::make_unique(random_seed, params_->batch_size, params_->vocab_size, params_->cuda_stream); } BeamSearch_Cuda::BeamSearch_Cuda(const GeneratorParams& params) From 18adb67dce145579779dd0b6a3e2b25826a7b459 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 8 Apr 2024 16:47:10 -0700 Subject: [PATCH 35/36] Add packed QKV and rotary embedding within GroupQueryAttention to model builder (#245) --- .gitignore | 2 +- src/python/py/models/README.md | 4 +- src/python/py/models/builder.py | 265 ++++++++++++++++++++------------ 3 files changed, 168 insertions(+), 103 deletions(-) diff --git a/.gitignore b/.gitignore index 60b60827f..d42e707d4 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,7 @@ /test/test_models/* /cache_models /onnxruntime-linux-x64-* -/*.csv +*.csv .idea cache_dir 
example-models diff --git a/src/python/py/models/README.md b/src/python/py/models/README.md index 0fdd2c818..34f24083e 100644 --- a/src/python/py/models/README.md +++ b/src/python/py/models/README.md @@ -62,10 +62,10 @@ python3 builder.py -m model_name -o path_to_output_folder -p precision -e execut This scenario is where your PyTorch model has been customized or finetuned for one of the currently supported model architectures and your model can be loaded in Hugging Face. ``` # From wheel: -python3 -m onnxruntime_genai.models.builder -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider +python3 -m onnxruntime_genai.models.builder -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider -c cache_dir_to_store_temp_files # From source: -python3 builder.py -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider +python3 builder.py -i path_to_local_folder_on_disk -o path_to_output_folder -p precision -e execution_provider -c cache_dir_to_store_temp_files ``` ### GGUF Model diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 740c7ca40..16f864c38 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -19,6 +19,7 @@ import os import textwrap + class Model: def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.context_length = config.max_position_embeddings @@ -48,7 +49,13 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.value_infos = [] self.nodes = [] - # Map input names to input shapes + # Map input names to their types and shapes + self.input_names = ["input_ids", "attention_mask", "position_ids"] + self.input_types = { + "input_ids": TensorProto.INT64, + "attention_mask": TensorProto.INT64, + "position_ids": TensorProto.INT64, + } self.input_shapes = { "input_ids": ["batch_size", "sequence_length"], "attention_mask": ["batch_size", "total_sequence_length"], @@ -105,19 +112,35 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 self.rotemb_attrs = { "create_rotary_embedding_caches": True, # Create cos/sin caches for rotary embeddings + "theta": rope_theta, # Base value if calculating cos/sin caches from scratch "partial_rotary_factor": partial_rotary_factor, # Factor for partial rotary embeddings + "interleaved": 0, # Interleave the rotary embeddings (e.g. [0, 0, 0, 1, 1, 1] to [0, 1, 0, 1, 0, 1], RotaryEmbedding kernel expects a default value of 0) "num_heads": 0, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0) "rotary_embedding_dim": 0, # For partial rotary embeddings (RotaryEmbedding kernel expects a default value of 0) - "theta": rope_theta, # Base value if calculating cos/sin caches from scratch } # Attention-specific variables (MHA, GQA, GQA + Rot.Emb., etc.) 
self.attention_attrs = { - "op_type": "MultiHeadAttention", # Attention op to use - "use_gqa": ep == "cuda" and io_dtype == TensorProto.FLOAT16 # Check if GroupQueryAttention can be used + "op_type": "MultiHeadAttention", # Attention op to use + "use_rotemb_in_gqa": False, # Use rotary embeddings within GroupQueryAttention (instead of a separate RotaryEmbedding op) + "use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V) } - if self.attention_attrs["use_gqa"]: + if ep == "cuda" and io_dtype == TensorProto.FLOAT16: self.attention_attrs["op_type"] = "GroupQueryAttention" + print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 CUDA and FP16 CUDA.") + + self.attention_attrs["use_packed_matmul"] = self.num_attn_heads == self.num_kv_heads + + # GQA + Rot.Emb. does not require `position ids` as input + self.attention_attrs["use_rotemb_in_gqa"] = True + self.input_names.remove("position_ids") + + # MLP-specific variables + self.mlp_attrs = { + "use_proj": True, # Use projection style for MLP (GateProj/UpProj/DownProj) + "use_fc": False, # Use fully-connected style for MLP (FC1/FC2) + "output_0": "", # Output 0 for MLP layer + } # Quantization-specific variables (INT4, INT8, etc.) self.quant_attrs = { @@ -129,7 +152,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): config = GenerationConfig.from_pretrained(model_name_or_path, **extra_kwargs) - inputs = dict(zip(self.input_shapes.keys(), self.input_shapes.keys())) + inputs = dict(zip(self.input_names, self.input_names)) inputs.update({ "past_key_names": "past_key_values.%d.key", "past_value_names": "past_key_values.%d.value", @@ -238,6 +261,7 @@ def save_model(self, out_dir): if os.path.exists(data_path): print(f"Overwriting {data_path}") os.remove(data_path) + save_model( model, out_path, @@ -305,9 +329,10 @@ def make_graph(self, *args, doc_string=None, **kwargs): def make_inputs_and_outputs(self): # Add model-specific inputs to list of model inputs inputs = [] - for name in self.model_inputs: + for name in self.input_names: + dtype = self.input_types[name] shape = self.input_shapes[name] - inputs.append(helper.make_tensor_value_info(name, TensorProto.INT64, shape=shape)) + inputs.append(helper.make_tensor_value_info(name, dtype, shape=shape)) # Add model-specific outputs to list of model outputs outputs = [ @@ -474,9 +499,13 @@ def make_matmul_fp16_or_fp32(self, matmul, name, root_input, **kwargs): # self.make_node("MatMulNBits", inputs=[root_input, weight, scales], outputs=[output], name=name) # self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) - # TODO: make packed QKV MatMul - # def make_packed_matmul(self, q_matmul, k_matmul, v_matmul, name, root_input, **kwargs): - # pass + def make_packed_matmul(self, q_matmul, k_matmul, v_matmul, name, root_input, **kwargs): + # N = num_heads * head_size, H = hidden_size + # Combine 3 Matmuls of shape NxH into 1 packed MatMul of shape 3NxH + # Note: Packed MatMul is of shape 3NxH instead of Hx3N because `make_matmul` will apply a transpose before saving + N, H = q_matmul.shape + matmul = np.stack((q_matmul.transpose(), k_matmul.transpose(), v_matmul.transpose()), axis=1).reshape(H, 3*N).transpose() + self.make_matmul(matmul, name, root_input, **kwargs) def make_add_bias(self, add, name, root_input, **kwargs): bias = name[1:].replace("/", ".") + ".bias" @@ -492,6 +521,11 @@ 
def make_add_bias(self, add, name, root_input, **kwargs): else: self.make_add(name, add_bias_inputs, dtype=self.io_dtype, shape=shape) + def make_packed_add(self, q_add, k_add, v_add, name, root_input, **kwargs): + # Combine 3 Adds of shape H into 1 packed Add of shape 3H + add = np.stack((q_add, k_add, v_add), axis=0).flatten() + self.make_add_bias(add, name, root_input, **kwargs) + def make_embedding(self, embedding): weight = "model.embed_tokens.weight" self.make_external_tensor(embedding.astype(self.to_numpy_dtype[self.io_dtype]), weight) @@ -587,7 +621,7 @@ def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): inputs = [root_input, kwargs.pop("position_ids"), cos_cache_name, sin_cache_name] output = f"{name}/output_0" - self.make_node("RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", interleaved=0, **kwargs) + self.make_node("RotaryEmbedding", inputs=inputs, outputs=[output], name=name, domain="com.microsoft", interleaved=self.rotemb_attrs["interleaved"], **kwargs) self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * (self.num_kv_heads if "k_rotary" in name else self.num_attn_heads)]) # TODO: This function and any corresponding changes to support it are temporary until ORT supports GQA for CPU @@ -795,10 +829,15 @@ def make_group_query_attention(self, name, **kwargs): kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], kwargs.get("past_k", ""), kwargs.get("past_v", ""), kwargs.get("seqlens_k", ""), kwargs.get("total_seq_len", ""), + kwargs.get("cos_cache", ""), kwargs.get("sin_cache", "") ] output = f"{name}/output_0" outputs = [output, kwargs.get("present_k", ""), kwargs.get("present_v", "")] - self.make_node("GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, local_window_size=self.window_size) + self.make_node( + "GroupQueryAttention", inputs=inputs, outputs=outputs, name=name, domain="com.microsoft", + num_heads=self.num_attn_heads, kv_num_heads=self.num_kv_heads, local_window_size=self.window_size, + do_rotary=self.attention_attrs["use_rotemb_in_gqa"], rotary_interleaved=self.rotemb_attrs["interleaved"], + ) self.make_value_info(output, self.io_dtype, shape=['batch_size', 'sequence_length', self.head_size * self.num_attn_heads]) def make_attention(self, layer_id, attention, root_input, **kwargs): @@ -841,60 +880,75 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): v_input_to_attention = "" # Make MatMul nodes - q_matmul_name = f"/model/layers.{layer_id}/attn/q_proj/MatMul" - self.make_matmul(attention.q_proj.weight.detach().numpy(), q_matmul_name, root_input) - q_input_to_attention = f"{q_matmul_name}/output_0" - k_matmul_name = f"/model/layers.{layer_id}/attn/k_proj/MatMul" - self.make_matmul(attention.k_proj.weight.detach().numpy(), k_matmul_name, root_input) - k_input_to_attention = f"{k_matmul_name}/output_0" - v_matmul_name = f"/model/layers.{layer_id}/attn/v_proj/MatMul" - self.make_matmul(attention.v_proj.weight.detach().numpy(), v_matmul_name, root_input) - v_input_to_attention = f"{v_matmul_name}/output_0" + if self.attention_attrs["use_packed_matmul"]: + # Combine 3 MatMuls into 1 packed MatMul + qkv_matmul_name = f"/model/layers.{layer_id}/attn/qkv_proj/MatMul" + self.make_packed_matmul(attention.q_proj.weight.detach().numpy(), attention.k_proj.weight.detach().numpy(), attention.v_proj.weight.detach().numpy(), qkv_matmul_name, root_input) + 
q_input_to_attention = f"{qkv_matmul_name}/output_0" + else: + q_matmul_name = f"/model/layers.{layer_id}/attn/q_proj/MatMul" + self.make_matmul(attention.q_proj.weight.detach().numpy(), q_matmul_name, root_input) + q_input_to_attention = f"{q_matmul_name}/output_0" + k_matmul_name = f"/model/layers.{layer_id}/attn/k_proj/MatMul" + self.make_matmul(attention.k_proj.weight.detach().numpy(), k_matmul_name, root_input) + k_input_to_attention = f"{k_matmul_name}/output_0" + v_matmul_name = f"/model/layers.{layer_id}/attn/v_proj/MatMul" + self.make_matmul(attention.v_proj.weight.detach().numpy(), v_matmul_name, root_input) + v_input_to_attention = f"{v_matmul_name}/output_0" # Make Add nodes (if bias exists) q_bias_exists = attention.q_proj.bias is not None k_bias_exists = attention.k_proj.bias is not None v_bias_exists = attention.v_proj.bias is not None + all_bias_exists = q_bias_exists and k_bias_exists and v_bias_exists - if q_bias_exists: - q_add_name = f"/model/layers.{layer_id}/attn/q_proj/Add" - self.make_add_bias(attention.q_proj.bias.detach().numpy(), q_add_name, root_input=f"{q_matmul_name}/output_0") - q_input_to_attention = f"{q_add_name}/output_0" - if k_bias_exists: - k_add_name = f"/model/layers.{layer_id}/attn/k_proj/Add" - self.make_add_bias(attention.k_proj.bias.detach().numpy(), k_add_name, root_input=f"{k_matmul_name}/output_0") - k_input_to_attention = f"{k_add_name}/output_0" - if v_bias_exists: - v_add_name = f"/model/layers.{layer_id}/attn/v_proj/Add" - self.make_add_bias(attention.v_proj.bias.detach().numpy(), v_add_name, root_input=f"{v_matmul_name}/output_0") - v_input_to_attention = f"{v_add_name}/output_0" + if all_bias_exists and self.attention_attrs["use_packed_matmul"]: + # Combine 3 Adds into 1 packed Add + qkv_add_name = f"/model/layers.{layer_id}/attn/qkv_proj/Add" + self.make_packed_add(attention.q_proj.bias.detach().numpy(), attention.k_proj.bias.detach().numpy(), attention.v_proj.bias.detach().numpy(), qkv_add_name, root_input=q_input_to_attention) + q_input_to_attention = f"{qkv_add_name}/output_0" + else: + if q_bias_exists: + q_add_name = f"/model/layers.{layer_id}/attn/q_proj/Add" + self.make_add_bias(attention.q_proj.bias.detach().numpy(), q_add_name, root_input=q_input_to_attention) + q_input_to_attention = f"{q_add_name}/output_0" + if k_bias_exists: + k_add_name = f"/model/layers.{layer_id}/attn/k_proj/Add" + self.make_add_bias(attention.k_proj.bias.detach().numpy(), k_add_name, root_input=k_input_to_attention) + k_input_to_attention = f"{k_add_name}/output_0" + if v_bias_exists: + v_add_name = f"/model/layers.{layer_id}/attn/v_proj/Add" + self.make_add_bias(attention.v_proj.bias.detach().numpy(), v_add_name, root_input=v_input_to_attention) + v_input_to_attention = f"{v_add_name}/output_0" # Make RotaryEmbedding nodes - q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" - q_rotary_input = f"{q_matmul_name if not q_bias_exists else q_add_name}/output_0" - self.make_rotary_embedding(attention.rotary_emb, q_rotary_name, q_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) - q_input_to_attention = f"{q_rotary_name}/output_0" - - k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" - k_rotary_input = f"{k_matmul_name if not k_bias_exists else k_add_name}/output_0" - self.make_rotary_embedding(attention.rotary_emb, k_rotary_name, k_rotary_input, position_ids=kwargs.get("position_ids", "position_ids")) - k_input_to_attention = f"{k_rotary_name}/output_0" + cos_cache_name, sin_cache_name = 
"", "" + if self.attention_attrs["use_rotemb_in_gqa"]: + cos_cache_name, sin_cache_name = self.make_rotary_embedding_caches(attention.rotary_emb) + else: + q_rotary_name = f"/model/layers.{layer_id}/attn/q_rotary/RotaryEmbedding" + self.make_rotary_embedding(attention.rotary_emb, q_rotary_name, root_input=q_input_to_attention, position_ids=kwargs.get("position_ids", "position_ids")) + q_input_to_attention = f"{q_rotary_name}/output_0" + k_rotary_name = f"/model/layers.{layer_id}/attn/k_rotary/RotaryEmbedding" + self.make_rotary_embedding(attention.rotary_emb, k_rotary_name, root_input=k_input_to_attention, position_ids=kwargs.get("position_ids", "position_ids")) + k_input_to_attention = f"{k_rotary_name}/output_0" # Make repeat KV nodes (TODO: remove once ORT supports GQA for CPU) past_k = f"past_key_values.{layer_id}.key" past_v = f"past_key_values.{layer_id}.value" present_k = f"present.{layer_id}.key" present_v = f"present.{layer_id}.value" - if self.num_attn_heads != self.num_kv_heads and not self.attention_attrs['use_gqa']: - k_input_to_attention = self.make_repeat_kv(layer_id, k_input_to_attention, past_k, present_k) - v_input_to_attention = self.make_repeat_kv(layer_id, v_input_to_attention, past_v, present_v) + if self.num_attn_heads != self.num_kv_heads and self.attention_attrs["op_type"] != "GroupQueryAttention": + k_input_to_attention = self.make_repeat_kv(layer_id, root_input=k_input_to_attention, past_kv=past_k, present_kv=present_k) + v_input_to_attention = self.make_repeat_kv(layer_id, root_input=v_input_to_attention, past_kv=past_v, present_kv=present_v) past_k, past_v, present_k, present_v = "", "", "", "" # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" self.make_attention_op( attn_name, q_path=q_input_to_attention, k_path=k_input_to_attention, v_path=v_input_to_attention, - past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, **kwargs, + past_k=past_k, past_v=past_v, present_k=present_k, present_v=present_v, + cos_cache=cos_cache_name, sin_cache=sin_cache_name, **kwargs, ) # Make MatMul node (output projection weight node) @@ -914,6 +968,14 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): self.layernorm_attrs["skip_input"] = f"{o_matmul_name if not o_bias_exists else o_add_name}/output_0" def make_mlp(self, layer_id, mlp, root_input): + if self.mlp_attrs["use_proj"]: + self.make_mlp_proj(layer_id, mlp, root_input) + elif self.mlp_attrs["use_fc"]: + self.make_mlp_fc(layer_id, mlp, root_input) + else: + raise NotImplementedError(f"The MLP layer type is not set.") + + def make_mlp_proj(self, layer_id, mlp, root_input): # Make nodes for the MLP subgraph # # root_input @@ -947,6 +1009,39 @@ def make_mlp(self, layer_id, mlp, root_input): # Assign output 0 of previous MatMul as skip input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{down_name}/output_0" + def make_mlp_fc(self, layer_id, mlp, root_input): + # Make nodes for the MLP subgraph + # + # root_input + # | + # FC1_MatMul + # | + # FC1_Add + # | + # ActFunc + # | + # FC2_MatMul + # | + # FC2_Add + + # Make first layer of fully connected nodes (FC1) + fc1_matmul_name = f"/model/layers.{layer_id}/mlp/fc1/MatMul" + self.make_matmul(mlp.fc1.weight.detach().numpy(), fc1_matmul_name, root_input) + fc1_add_name = f"/model/layers.{layer_id}/mlp/fc1/Add" + self.make_add_bias(mlp.fc1.bias.detach().numpy(), fc1_add_name, root_input=f"{fc1_matmul_name}/output_0") + + # 
Make activation function + act_fn_name = self.make_activation(layer_id, root_input=f"{fc1_add_name}/output_0") + + # Make second layer of fully connected nodes (FC2) + fc2_matmul_name = f"/model/layers.{layer_id}/mlp/fc2/MatMul" + self.make_matmul(mlp.fc2.weight.detach().numpy(), fc2_matmul_name, root_input=f"{act_fn_name}/output_0") + fc2_add_name = f"/model/layers.{layer_id}/mlp/fc2/Add" + self.make_add_bias(mlp.fc2.bias.detach().numpy(), fc2_add_name, root_input=f"{fc2_matmul_name}/output_0") + + # Assign output 0 of MLP layer as output of last layer + self.mlp_attrs["output_0"] = f"{fc2_add_name}/output_0" + def make_activation_with_mul(self, layer_id, root_input, activation, domain): # Make nodes for this activation subgraph # @@ -1071,6 +1166,12 @@ def has_final_norm(self, module, model): return hf_norm or hf_final_layernorm or gguf_final_norm def make_attention_mask_reformatting(self): + if self.attention_attrs["op_type"] == "GroupQueryAttention": + self.make_attention_mask_reformatting_for_gqa() + else: + self.make_attention_mask_reformatting_2d_to_4d() + + def make_attention_mask_reformatting_2d_to_4d(self): # Make nodes for the attention mask subgraphs that reformat the # 2D attention mask (B, S) to 4D causal attention mask (B, N, S, T) # @@ -1370,17 +1471,7 @@ def make_common_mask_reformat_subgraph(self, basename, root_input, unsqueeze_for return expand_name - -class LlamaModel(Model): - def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - self.model_inputs = ["input_ids", "attention_mask", "position_ids"] - - def make_attention_mask_reformatting(self): - if not self.attention_attrs["use_gqa"]: - super().make_attention_mask_reformatting() - return - + def make_attention_mask_reformatting_for_gqa(self): # Make nodes for the attention mask subgraph that calculates # attributes about the 2D attention mask to use in GroupQueryAttention # @@ -1420,12 +1511,6 @@ def make_attention_mask_reformatting(self): self.mask_attrs["seqlens_k"] = cast_1_name self.mask_attrs["total_seq_len"] = cast_2_name - -class MistralModel(LlamaModel): - def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): - super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) - self.position_ids_name = self.make_position_ids_reformatting() - def make_position_ids_reformatting(self): # Make nodes for the position ids reformatting subgraph # @@ -1461,62 +1546,42 @@ def make_position_ids_reformatting(self): return reshape_name + +class LlamaModel(Model): + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + + +class MistralModel(Model): + def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): + super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) + self.position_ids_name = f"{self.make_position_ids_reformatting()}/output_0" if not self.attention_attrs["use_rotemb_in_gqa"] else "position_ids" + def make_attention(self, layer_id, attention, root_input, **kwargs): - super().make_attention(layer_id, attention, root_input, position_ids=f"{self.position_ids_name}/output_0", **kwargs) + super().make_attention(layer_id, attention, root_input, position_ids=self.position_ids_name, **kwargs) -class PhiModel(LlamaModel): +class PhiModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): 
super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options) # self.input_shapes["position_ids"] = [1] # Note: This is optional and only needed if you want position_ids to be an int instead of a 2D tensor self.layernorm_attrs["simple"] = False self.rotemb_attrs["num_heads"] = self.num_attn_heads self.rotemb_attrs["rotary_embedding_dim"] = int(self.head_size * self.rotemb_attrs["partial_rotary_factor"]) + self.mlp_attrs["use_proj"], self.mlp_attrs["use_fc"] = False, True def make_rotary_embedding(self, rotemb, name, root_input, **kwargs): super().make_rotary_embedding(rotemb, name, root_input, num_heads=self.rotemb_attrs["num_heads"], rotary_embedding_dim=self.rotemb_attrs["rotary_embedding_dim"], **kwargs) - - def make_mlp(self, layer_id, mlp, root_input): - # Make nodes for the MLP subgraph - # - # root_input - # | - # FC1_MatMul - # | - # FC1_Add - # | - # FastGelu - # | - # FC2_MatMul - # | - # FC2_Add - - # Make first layer of fully connected nodes (FC1) - fc1_matmul_name = f"/model/layers.{layer_id}/mlp/fc1/MatMul" - self.make_matmul(mlp.fc1.weight.detach().numpy(), fc1_matmul_name, root_input) - fc1_add_name = f"/model/layers.{layer_id}/mlp/fc1/Add" - self.make_add_bias(mlp.fc1.bias.detach().numpy(), fc1_add_name, root_input=f"{fc1_matmul_name}/output_0") - - # Make activation function - fast_gelu_name = self.make_activation(layer_id, root_input=f"{fc1_add_name}/output_0") - - # Make second layer of fully connected nodes (FC2) - fc2_matmul_name = f"/model/layers.{layer_id}/mlp/fc2/MatMul" - self.make_matmul(mlp.fc2.weight.detach().numpy(), fc2_matmul_name, root_input=f"{fast_gelu_name}/output_0") - fc2_add_name = f"/model/layers.{layer_id}/mlp/fc2/Add" - self.make_add_bias(mlp.fc2.bias.detach().numpy(), fc2_add_name, root_input=f"{fc2_matmul_name}/output_0") - - return fc2_add_name def make_layer(self, layer_id, layer): # Each Phi decoder layer is defined as: # input_layernorm --> attention --> MLP --> residual_add self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input") self.make_attention(layer_id, layer.self_attn, self.layernorm_attrs["output_0"]) - fc2_add_name = self.make_mlp(layer_id, layer.mlp, self.layernorm_attrs["output_0"]) + self.make_mlp(layer_id, layer.mlp, self.layernorm_attrs["output_0"]) residual_add_name = f"/model/layers.{layer_id}/residual_add/Add" - residual_add_inputs = [self.layernorm_attrs['skip_input'], f"{fc2_add_name}/output_0"] + residual_add_inputs = [self.layernorm_attrs['skip_input'], self.mlp_attrs["output_0"]] self.make_add(residual_add_name, residual_add_inputs, dtype=self.io_dtype, shape=['batch_size', 'sequence_length', self.hidden_size]) self.layernorm_attrs["first_layernorm"] = False From bc503fb1929525aa0a4bdc513fc2321ea951f5c8 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Tue, 9 Apr 2024 14:40:44 -0700 Subject: [PATCH 36/36] Add C++ benchmark program (#243) Add a C++ model benchmark program. It is modeled after the existing Python benchmark script (benchmark/python/benchmark_e2e.py). The motivation for a C++ version is to be able to run without Python. This is useful for Android. 
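A rough example invocation, for illustration only: the executable name comes from the new `model_benchmark` CMake target added in this patch, the flag names from the parser in benchmark/c/options.cpp, and the model folder path is a placeholder.

model_benchmark -i {path to ONNX model folder} -b 1 -l 16 -g 128 -r 5 -w 1 -v

The numeric values shown simply restate the defaults declared in benchmark/c/options.h (batch size 1, 16 prompt tokens, 128 generated tokens, 5 measured repetitions, 1 warmup run), so they can be omitted for the same behavior.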
--- CMakeLists.txt | 20 ++- benchmark/c/CMakeLists.txt | 25 ++++ benchmark/c/main.cpp | 242 ++++++++++++++++++++++++++++++++++++ benchmark/c/options.cpp | 110 ++++++++++++++++ benchmark/c/options.h | 22 ++++ cmake/options.cmake | 1 + src/csharp/Generator.cs | 4 +- src/csharp/NativeMethods.cs | 8 +- src/ort_genai.h | 50 ++++++-- src/ort_genai_c.cpp | 4 +- src/ort_genai_c.h | 9 +- src/python/CMakeLists.txt | 1 - test/CMakeLists.txt | 1 - test/c_api_tests.cpp | 24 ++-- 14 files changed, 486 insertions(+), 35 deletions(-) create mode 100644 benchmark/c/CMakeLists.txt create mode 100644 benchmark/c/main.cpp create mode 100644 benchmark/c/options.cpp create mode 100644 benchmark/c/options.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ff70bea11..b325c2763 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,11 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_definitions(_DEBUG=1) endif() +if(MSVC) + # set updated value for __cplusplus macro instead of 199711L + add_compile_options($<$:/Zc:__cplusplus>) +endif() + message(STATUS "Adding source files") file(GLOB generator_srcs CONFIGURE_DEPENDS @@ -127,6 +132,11 @@ else() set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.so") endif() +file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") +if(USE_DML) + list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") +endif() + if(NO_TOKENIZEROOT) add_compile_definitions(NO_TOKENIZER=1) message("----------------Tokenizer Disabled------------------") @@ -148,6 +158,11 @@ if(ENABLE_PYTHON) message("------------------Enabling Python Wheel------------------") endif() +if(ENABLE_MODEL_BENCHMARK) + add_subdirectory("${CMAKE_SOURCE_DIR}/benchmark/c") + message("------------------Enabling model benchmark------------------") +endif() + if(NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_LIB}") message(FATAL_ERROR "Expected the ONNX Runtime library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_LIB}. Actual: Not found.") endif() @@ -158,7 +173,6 @@ if(USE_CUDA AND NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_CUDA_LIB}") message(FATAL_ERROR "Expected the ONNX Runtime providers cuda library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_CUDA_LIB}. Actual: Not found.") endif() -file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) target_link_libraries(onnxruntime-genai PRIVATE ${ONNXRUNTIME_LIB}) @@ -182,10 +196,6 @@ if(MSVC) endif() # Copy the onnxruntime binaries into the build folder so it's found on launch -file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") -if(USE_DML) - list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll") -endif() foreach(DLL_FILE ${onnxruntime_libs}) add_custom_command( TARGET onnxruntime-genai POST_BUILD diff --git a/benchmark/c/CMakeLists.txt b/benchmark/c/CMakeLists.txt new file mode 100644 index 000000000..0035f3e5e --- /dev/null +++ b/benchmark/c/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +set(model_benchmark_srcs + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/options.h + ${CMAKE_CURRENT_SOURCE_DIR}/options.cpp +) + +add_executable(model_benchmark ${model_benchmark_srcs}) + +target_include_directories(model_benchmark PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/src # directory containing the ort_genai headers +) + +target_link_libraries(model_benchmark PRIVATE onnxruntime-genai-static ${ONNXRUNTIME_LIB}) + +target_link_directories(model_benchmark PRIVATE ${ORT_LIB_DIR}) + +add_custom_command(TARGET model_benchmark POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${onnxruntime_libs} $ +) + +source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${model_benchmark_srcs}) diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp new file mode 100644 index 000000000..0a6840c42 --- /dev/null +++ b/benchmark/c/main.cpp @@ -0,0 +1,242 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ort_genai.h" + +#include "options.h" + +namespace { + +using Clock = std::chrono::steady_clock; + +using Duration = Clock::duration; +using DurationFp = std::chrono::duration; + +class Timing { + public: + Timing(const Timing&) = delete; + Timing& operator=(const Timing&) = delete; + + Timing(std::vector& measurements) + : measurements_{measurements}, start_{Clock::now()} { + } + + ~Timing() { + const auto measurement = Clock::now() - start_; + measurements_.push_back(measurement); + } + + private: + std::vector& measurements_; + const Clock::time_point start_; +}; + +struct Statistics { + DurationFp average{}; + DurationFp stddev{}; + DurationFp p50{}; + DurationFp p90{}; + DurationFp p99{}; + size_t n{}; +}; + +Statistics ComputeStats(const std::vector& measurements) { + Statistics stats{}; + if (measurements.empty()) { + return stats; + } + + stats.n = measurements.size(); + + const auto sum = std::accumulate(measurements.begin(), measurements.end(), Duration{0}); + stats.average = DurationFp{sum} / stats.n; + + std::vector sorted = measurements; + std::sort(sorted.begin(), sorted.end()); + + stats.p50 = sorted[stats.n * 0.5]; + stats.p90 = sorted[stats.n * 0.9]; + stats.p99 = sorted[stats.n * 0.99]; + + if (stats.n > 1) { + const float variance = + std::accumulate( + measurements.begin(), measurements.end(), + 0.0f, + [mean = stats.average.count()](float accumulator, const Duration& m) -> float { + const float distance_from_mean = m.count() - mean; + return accumulator + distance_from_mean * distance_from_mean; + }) / + (stats.n - 1); + + const float stddev = std::sqrt(variance); + stats.stddev = DurationFp{stddev}; + } + + return stats; +} + +void WritePerTokenStats(std::string_view label, + const Statistics& stats, + const size_t tokens_per_measurement) { + using MicrosecondsFp = std::chrono::duration; + const auto avg_us = MicrosecondsFp{stats.average}; + std::cout << label << ":" + << "\n\tavg (us): " << avg_us.count() + << "\n\tavg (tokens/s): " << 1.0e6f / avg_us.count() * tokens_per_measurement + << "\n\tp50 (us): " << MicrosecondsFp{stats.p50}.count() + << "\n\tstddev (us): " << MicrosecondsFp{stats.stddev}.count() + << "\n\tn: " << stats.n << " * " << tokens_per_measurement << " token(s)" + << "\n"; +} + +void WriteE2EStats(std::string_view label, + const Statistics& stats) { + using MillisecondsFp = std::chrono::duration; + std::cout << label << ":" + << "\n\tavg (ms): " << 
MillisecondsFp{stats.average}.count() + << "\n\tp50 (ms): " << MillisecondsFp{stats.p50}.count() + << "\n\tstddev (ms): " << MillisecondsFp{stats.stddev}.count() + << "\n\tn: " << stats.n + << "\n"; +} + +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { + const char* const base_prompt = "A"; + auto base_prompt_sequences = OgaSequences::Create(); + + tokenizer.Encode(base_prompt, *base_prompt_sequences); + + auto params = OgaGeneratorParams::Create(model); + params->SetSearchOption("max_length", num_prompt_tokens); + params->SetSearchOption("min_length", num_prompt_tokens); + params->SetInputSequences(*base_prompt_sequences); + + auto output_sequences = model.Generate(*params); + const auto output_sequence_length = output_sequences->SequenceCount(0); + const auto* output_sequence_data = output_sequences->SequenceData(0); + return std::string{tokenizer.Decode(output_sequence_data, output_sequence_length)}; +} + +void RunBenchmark(const benchmark::Options& opts) { + auto model = OgaModel::Create(opts.model_path.c_str()); + auto tokenizer = OgaTokenizer::Create(*model); + + const std::string prompt = GeneratePrompt(opts.num_prompt_tokens, *model, *tokenizer); + auto prompt_sequences = OgaSequences::Create(); + + if (opts.batch_size < 1) { + throw std::runtime_error("Batch size must be at least 1."); + } + + for (size_t i = 0; i < opts.batch_size; ++i) { + tokenizer->Encode(prompt.c_str(), *prompt_sequences); + } + + const size_t num_prompt_tokens = prompt_sequences->SequenceCount(0); + const size_t num_tokens = num_prompt_tokens + opts.num_tokens_to_generate; + + auto make_generator_params = [&] { + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", num_tokens); + params->SetSearchOption("min_length", num_tokens); + params->SetInputSequences(*prompt_sequences); + return params; + }; + + const auto generator_params = make_generator_params(); + + // warmup + if (opts.verbose) std::cout << "Running warmup iterations (" << opts.num_warmup_iterations << ")...\n"; + for (size_t i = 0; i < opts.num_warmup_iterations; ++i) { + auto output_sequences = model->Generate(*generator_params); + + if (opts.verbose && i == 0) { + // show prompt and output on first iteration + std::cout << "Prompt:\n\t" << prompt << "\n"; + const auto output_sequence_length = output_sequences->SequenceCount(0); + const auto* output_sequence_data = output_sequences->SequenceData(0); + const auto output = tokenizer->Decode(output_sequence_data, output_sequence_length); + std::cout << "Output:\n\t" << output << "\n"; + } + } + + std::vector e2e_gen_times, prompt_processing_times, token_gen_times, sampling_times; + // note: be sure to reserve enough to avoid vector reallocations in the measured code + e2e_gen_times.reserve(opts.num_iterations); + prompt_processing_times.reserve(opts.num_iterations); + token_gen_times.reserve(opts.num_iterations * (opts.num_tokens_to_generate - 1)); + sampling_times.reserve(opts.num_iterations * opts.num_tokens_to_generate); + + if (opts.verbose) std::cout << "Running iterations (" << opts.num_iterations << ")...\n"; + for (size_t i = 0; i < opts.num_iterations; ++i) { + auto generator = OgaGenerator::Create(*model, *generator_params); + + { + Timing e2e_gen_timing{e2e_gen_times}; + + { + Timing prompt_processing_timing{prompt_processing_times}; + generator->ComputeLogits(); + } + + { + Timing sampling_timing{sampling_times}; + generator->GenerateNextToken(); + } + + while (!generator->IsDone()) { + { 
+ Timing token_gen_timing{token_gen_times}; + generator->ComputeLogits(); + } + + { + Timing sampling_timing{sampling_times}; + generator->GenerateNextToken(); + } + } + } + } + + { + std::cout << "Batch size: " << opts.batch_size + << ", prompt tokens: " << num_prompt_tokens + << ", tokens to generate: " << opts.num_tokens_to_generate + << "\n"; + + const auto e2e_gen_stats = ComputeStats(e2e_gen_times); + const auto prompt_processing_stats = ComputeStats(prompt_processing_times); + const auto token_gen_stats = ComputeStats(token_gen_times); + const auto sampling_stats = ComputeStats(sampling_times); + + WritePerTokenStats("Prompt processing (time to first token)", + prompt_processing_stats, opts.batch_size * num_prompt_tokens); + WritePerTokenStats("Token generation", token_gen_stats, opts.batch_size); + WritePerTokenStats("Token sampling", sampling_stats, opts.batch_size); + WriteE2EStats("E2E generation (entire generation loop)", e2e_gen_stats); + } +} + +} // namespace + +int main(int argc, char** argv) { + try { + const auto opts = benchmark::ParseOptionsFromCommandLine(argc, argv); + RunBenchmark(opts); + return 0; + } catch (const std::exception& e) { + std::cerr << "Exception: " << e.what() << "\n"; + return 1; + } +} diff --git a/benchmark/c/options.cpp b/benchmark/c/options.cpp new file mode 100644 index 000000000..7047a4466 --- /dev/null +++ b/benchmark/c/options.cpp @@ -0,0 +1,110 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "options.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace benchmark { + +namespace { + +[[noreturn]] void PrintHelpAndExit(const char* program_name, int exit_code) { + Options defaults{}; + std::ostringstream s; + + s << "Usage: " << program_name << " -i \n" + << " Options:\n" + << " -i,--input_folder \n" + << " Path to the ONNX model directory to benchmark, compatible with onnxruntime-genai.\n " + << " -b,--batch_size \n" + << " Number of sequences to generate in parallel. Default: " << defaults.batch_size << "\n" + << " -l,--prompt_length \n" + << " Number of tokens in the prompt. Default: " << defaults.num_prompt_tokens << "\n" + << " -g,--generation_length \n" + << " Number of tokens to generate. Default: " << defaults.num_tokens_to_generate << "\n" + << " -r,--repetitions \n" + << " Number of times to repeat the benchmark. Default: " << defaults.num_iterations << "\n" + << " -w,--warmup \n" + << " Number of warmup runs before benchmarking. Default: " << defaults.num_warmup_iterations << "\n" + << " -v,--verbose\n" + << " Show more informational output.\n" + << " -h,--help\n" + << " Show this help message and exit.\n"; + + std::cerr << s.str(); + std::exit(exit_code); +} + +template +T ParseNumber(std::string_view s) { + T n; + const auto *s_begin = s.data(), *s_end = s.data() + s.size(); + const auto [ptr, ec] = std::from_chars(s_begin, s_end, n); + if (ec != std::errc{} || ptr != s_end) { + throw std::runtime_error(std::string{"Failed to parse option value as number: "}.append(s)); + } + return n; +} + +void VerifyOptions(const Options& opts) { + if (opts.model_path.empty()) { + throw std::runtime_error("ONNX model directory path must be provided."); + } +} + +} // namespace + +Options ParseOptionsFromCommandLine(int argc, const char* const* argv) { + const char* const program_name = argc > 0 ? 
argv[0] : "model_benchmark"; + try { + Options opts{}; + + auto next_arg = [argc, argv](int& idx) { + if (idx + 1 >= argc) { + throw std::runtime_error("Option value not provided."); + } + return std::string_view{argv[++idx]}; + }; + + for (int i = 1; i < argc; ++i) { + std::string_view arg{argv[i]}; + + if (arg == "-i" || arg == "--input_folder") { + opts.model_path = next_arg(i); + } else if (arg == "-b" || arg == "--batch_size") { + opts.batch_size = ParseNumber(next_arg(i)); + } else if (arg == "-l" || arg == "--prompt_length") { + opts.num_prompt_tokens = ParseNumber(next_arg(i)); + } else if (arg == "-g" || arg == "--generation_length") { + opts.num_tokens_to_generate = ParseNumber(next_arg(i)); + } else if (arg == "-r" || arg == "--repetitions") { + opts.num_iterations = ParseNumber(next_arg(i)); + } else if (arg == "-w" || arg == "--warmup") { + opts.num_warmup_iterations = ParseNumber(next_arg(i)); + } else if (arg == "-v" || arg == "--verbose") { + opts.verbose = true; + } else if (arg == "-h" || arg == "--help") { + PrintHelpAndExit(program_name, 0); + } else { + throw std::runtime_error(std::string{"Unknown option: "}.append(arg)); + } + } + + VerifyOptions(opts); + + return opts; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << "\n"; + PrintHelpAndExit(program_name, 1); + } +} + +} // namespace benchmark diff --git a/benchmark/c/options.h b/benchmark/c/options.h new file mode 100644 index 000000000..a00d19191 --- /dev/null +++ b/benchmark/c/options.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +namespace benchmark { + +struct Options { + std::string model_path{}; + size_t num_prompt_tokens{16}; + size_t num_tokens_to_generate{128}; + size_t batch_size{1}; + size_t num_iterations{5}; + size_t num_warmup_iterations{1}; + bool verbose{false}; +}; + +Options ParseOptionsFromCommandLine(int argc, const char* const* argv); + +} // namespace benchmark diff --git a/cmake/options.cmake b/cmake/options.cmake index 80f004215..ac40a6d1d 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -6,5 +6,6 @@ option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." 
ON) option(ENABLE_TESTS "Enable tests" ON) option(TEST_PHI2 "Enable tests for Phi2" OFF) +option(ENABLE_MODEL_BENCHMARK "Build model benchmark program" ON) cmake_dependent_option(BUILD_WHEEL "Build the python wheel" ON "ENABLE_PYTHON" OFF) \ No newline at end of file diff --git a/src/csharp/Generator.cs b/src/csharp/Generator.cs index 64c1c5623..e2772d632 100644 --- a/src/csharp/Generator.cs +++ b/src/csharp/Generator.cs @@ -32,8 +32,8 @@ public void GenerateNextToken() public ReadOnlySpan GetSequence(ulong index) { - ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceLength(_generatorHandle, (UIntPtr)index).ToUInt64(); - IntPtr sequencePtr = NativeMethods.OgaGenerator_GetSequence(_generatorHandle, (UIntPtr)index); + ulong sequenceLength = NativeMethods.OgaGenerator_GetSequenceCount(_generatorHandle, (UIntPtr)index).ToUInt64(); + IntPtr sequencePtr = NativeMethods.OgaGenerator_GetSequenceData(_generatorHandle, (UIntPtr)index); unsafe { return new ReadOnlySpan(sequencePtr.ToPointer(), (int)sequenceLength); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index 552c9046a..7766c5e02 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -82,15 +82,15 @@ internal class NativeLib // This function returns the length of the sequence at the given index. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceLength(IntPtr /* const OgaGenerator* */ generator, - UIntPtr /* size_t */ index); + public static extern UIntPtr /* size_t */ OgaGenerator_GetSequenceCount(IntPtr /* const OgaGenerator* */ generator, + UIntPtr /* size_t */ index); // This function returns the sequence data at the given index. The returned pointer is owned by the // OgaGenerator object and will be freed when the OgaGenerator object is destroyed. It is expected // that the caller copies the data returned by this function after calling this function. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* const in32_t* */ OgaGenerator_GetSequence(IntPtr /* const OgaGenerator* */ generator, - UIntPtr /* size_t */ index); + public static extern IntPtr /* const in32_t* */ OgaGenerator_GetSequenceData(IntPtr /* const OgaGenerator* */ generator, + UIntPtr /* size_t */ index); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaCreateSequences(out IntPtr /* OgaSequences** */ sequences); diff --git a/src/ort_genai.h b/src/ort_genai.h index 82f8c722c..ea831ee2e 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -1,5 +1,15 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+ +#pragma once + +#include +#include + +#if __cplusplus >= 202002L +#include +#endif + #include "ort_genai_c.h" // GenAI C++ API @@ -55,7 +65,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -84,9 +94,19 @@ struct OgaSequences : OgaAbstract { return OgaSequencesCount(this); } + size_t SequenceCount(size_t index) const { + return OgaSequencesGetSequenceCount(this, index); + } + + const int32_t* SequenceData(size_t index) const { + return OgaSequencesGetSequenceData(this, index); + } + +#if __cplusplus >= 202002L std::span Get(size_t index) const { - return {OgaSequencesGetSequenceData(this, index), OgaSequencesGetSequenceCount(this, index)}; + return {SequenceData(index), SequenceCount(index)}; } +#endif static void operator delete(void* p) { OgaDestroySequences(reinterpret_cast(p)); } }; @@ -102,11 +122,19 @@ struct OgaTokenizer : OgaAbstract { OgaCheckResult(OgaTokenizerEncode(this, str, &sequences)); } + OgaString Decode(const int32_t* tokens_data, size_t tokens_length) const { + const char* p; + OgaCheckResult(OgaTokenizerDecode(this, tokens_data, tokens_length, &p)); + return p; + } + +#if __cplusplus >= 202002L OgaString Decode(std::span tokens) const { const char* p; OgaCheckResult(OgaTokenizerDecode(this, tokens.data(), tokens.size(), &p)); return p; } +#endif static void operator delete(void* p) { OgaDestroyTokenizer(reinterpret_cast(p)); } }; @@ -139,15 +167,11 @@ struct OgaGeneratorParams : OgaAbstract { return std::unique_ptr(p); } - void SetSearchOption(const char* name, int value) { - OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); - } - void SetSearchOption(const char* name, double value) { OgaCheckResult(OgaGeneratorParamsSetSearchNumber(this, name, value)); } - void SetSearchOption(const char* name, bool value) { + void SetSearchOptionBool(const char* name, bool value) { OgaCheckResult(OgaGeneratorParamsSetSearchBool(this, name, value)); } @@ -181,9 +205,19 @@ struct OgaGenerator : OgaAbstract { OgaCheckResult(OgaGenerator_GenerateNextToken(this)); } + size_t GetSequenceCount(size_t index) const { + return OgaGenerator_GetSequenceCount(this, index); + } + + const int32_t* GetSequenceData(size_t index) const { + return OgaGenerator_GetSequenceData(this, index); + } + +#if __cplusplus >= 202002L std::span GetSequence(size_t index) const { - return {OgaGenerator_GetSequence(this, index), OgaGenerator_GetSequenceLength(this, index)}; + return {GetSequenceData(index), GetSequenceCount(index)}; } +#endif static void operator delete(void* p) { OgaDestroyGenerator(reinterpret_cast(p)); } }; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index e9548d509..78c1b8ecd 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -157,12 +157,12 @@ OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* generator) OGA_CATCH } -size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* oga_generator, size_t index) { +size_t OGA_API_CALL OgaGenerator_GetSequenceCount(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().size(); } -const int32_t* OGA_API_CALL OgaGenerator_GetSequence(const OgaGenerator* oga_generator, size_t index) { +const int32_t* OGA_API_CALL 
OgaGenerator_GetSequenceData(const OgaGenerator* oga_generator, size_t index) { auto& generator = *reinterpret_cast(oga_generator); return generator.GetSequence(static_cast(index)).GetCPU().data(); } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 41eb65909..21a1fb2f7 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -1,5 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. + +#pragma once + #include #include @@ -179,17 +182,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerator_GenerateNextToken(OgaGenerator* * \param[in] generator The generator to get the count of the tokens for the sequence at the given index. * \return The number tokens in the sequence at the given index. */ -OGA_EXPORT size_t OGA_API_CALL OgaGenerator_GetSequenceLength(const OgaGenerator* generator, size_t index); +OGA_EXPORT size_t OGA_API_CALL OgaGenerator_GetSequenceCount(const OgaGenerator* generator, size_t index); /* * \brief Returns a pointer to the sequence data at the given index. The number of tokens in the sequence - * is given by OgaGenerator_GetSequenceLength + * is given by OgaGenerator_GetSequenceCount * \param[in] generator The generator to get the sequence data for the sequence at the given index. * \return The pointer to the sequence data at the given index. The sequence data is owned by the OgaGenerator * and will be freed when the OgaGenerator is destroyed. The caller must copy the data if it needs to * be used after the OgaGenerator is destroyed. */ -OGA_EXPORT const int32_t* OGA_API_CALL OgaGenerator_GetSequence(const OgaGenerator* generator, size_t index); +OGA_EXPORT const int32_t* OGA_API_CALL OgaGenerator_GetSequenceData(const OgaGenerator* generator, size_t index); OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateTokenizer(const OgaModel* model, OgaTokenizer** out); OGA_EXPORT void OGA_API_CALL OgaDestroyTokenizer(OgaTokenizer*); diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 942664246..e2175039a 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -44,7 +44,6 @@ if(BUILD_WHEEL) file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/py/" DESTINATION ${WHEEL_TARGET_NAME}/) file(COPY "${CMAKE_SOURCE_DIR}/ThirdPartyNotices.txt" DESTINATION ${WHEEL_TARGET_NAME}/) - file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") add_custom_command(TARGET python POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_libs} $ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 83571118e..daf8c40b3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -49,7 +49,6 @@ else() target_include_directories(unit_tests PRIVATE ${TOKENIZER_ROOT}) target_link_libraries(unit_tests PRIVATE tokenizer) endif() -file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}") set(TEST_MODEL_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/test_models/") set(TEST_MODEL_DES_DIR "$/test_models/") add_custom_command(TARGET unit_tests POST_BUILD diff --git a/test/c_api_tests.cpp b/test/c_api_tests.cpp index a1ec2b923..fbdb4e541 100644 --- a/test/c_api_tests.cpp +++ b/test/c_api_tests.cpp @@ -120,10 +120,13 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - auto sequence = generator->GetSequence(i); + const auto sequence_length = generator->GetSequenceCount(i); + const auto* sequence_data = generator->GetSequenceData(i); - auto* expected_output_start = &expected_output[i * max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, 
sequence.data(), max_length * sizeof(int32_t))); + ASSERT_LE(sequence_length, max_length); + + const auto* expected_output_start = &expected_output[i * max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t))); } // Test high level API @@ -131,10 +134,13 @@ TEST(CAPITests, GreedySearchGptFp32CAPI) { // Verify outputs match expected outputs for (int i = 0; i < batch_size; i++) { - auto sequence = sequences->Get(i); + const auto sequence_length = sequences->SequenceCount(i); + const auto* sequence_data = sequences->SequenceData(i); + + ASSERT_LE(sequence_length, max_length); - auto* expected_output_start = &expected_output[i * max_length]; - EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence.data(), max_length * sizeof(int32_t))); + const auto* expected_output_start = &expected_output[i * max_length]; + EXPECT_TRUE(0 == std::memcmp(expected_output_start, sequence_data, sequence_length * sizeof(int32_t))); } } @@ -199,7 +205,7 @@ struct Phi2Test { TEST(CAPITests, TopKCAPI) { Phi2Test test; - test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOptionBool("do_sample", true); test.params_->SetSearchOption("top_k", 50); test.params_->SetSearchOption("temperature", 0.6f); @@ -209,7 +215,7 @@ TEST(CAPITests, TopKCAPI) { TEST(CAPITests, TopPCAPI) { Phi2Test test; - test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOptionBool("do_sample", true); test.params_->SetSearchOption("top_p", 0.6f); test.params_->SetSearchOption("temperature", 0.6f); @@ -219,7 +225,7 @@ TEST(CAPITests, TopPCAPI) { TEST(CAPITests, TopKTopPCAPI) { Phi2Test test; - test.params_->SetSearchOption("do_sample", true); + test.params_->SetSearchOptionBool("do_sample", true); test.params_->SetSearchOption("top_k", 50); test.params_->SetSearchOption("top_p", 0.6f); test.params_->SetSearchOption("temperature", 0.6f);