From a6cbaf54122d946544af7a7a3613a7fee4eabed3 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:11:29 -0700 Subject: [PATCH 01/15] Fix naming transpose nodes (#348) ### Description This PR adds the node name for the `Transpose` nodes. ### Motivation and Context While adding Phi-3 mini to the model builder, a new checker was added to prevent adding the same node twice in an ONNX model. The new checker found that the `Transpose` nodes were missing their unique names. --- src/python/py/models/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1a9c37a3e..a2c61b947 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -528,7 +528,7 @@ def make_mul(self, name, inputs, dtype, shape): def make_transpose(self, name, root_input, dtype, shape, perm): output = f"{name}/output_0" - self.make_node("Transpose", inputs=[root_input], outputs=[output], perm=perm) + self.make_node("Transpose", inputs=[root_input], outputs=[output], name=name, perm=perm) self.make_value_info(output, dtype, shape=shape) def make_matmul(self, matmul, name, root_input, **kwargs): From 769162cfb364e9ace09a9143ee768e152be0498a Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Mon, 29 Apr 2024 11:32:14 -0700 Subject: [PATCH 02/15] fix C++ APIs in C example (#347) --- examples/c/src/main.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index e4be639f2..09f0c9aa5 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -5,11 +5,14 @@ // C++ API Example void CXX_API(const char* model_path) { + std::cout << "Creating model..." << std::endl; auto model = OgaModel::Create(model_path); + std::cout << "Creating tokenizer..." << std::endl; auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; - std::cout << "Prompt: " << std::endl << prompt << std::endl; + std::cout << "Prompt: " << std::endl + << prompt << std::endl; auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt, *sequences); @@ -19,16 +22,19 @@ void CXX_API(const char* model_path) { params->SetInputSequences(*sequences); auto output_sequences = model->Generate(*params); - auto out_string = tokenizer->Decode(output_sequences->Get(0)); + const auto output_sequence_length = output_sequences->SequenceCount(0); + const auto* output_sequence_data = output_sequences->SequenceData(0); + auto out_string = tokenizer->Decode(output_sequence_data, output_sequence_length); - std::cout << "Output: " << std::endl << out_string << std::endl; + std::cout << "Output: " << std::endl + << out_string << std::endl; } // C API Example void CheckResult(OgaResult* result) { if (result) { - std::string string=OgaResultGetError(result); + std::string string = OgaResultGetError(result); OgaDestroyResult(result); throw std::runtime_error(string); } @@ -36,9 +42,11 @@ void CheckResult(OgaResult* result) { void C_API(const char* model_path) { OgaModel* model; + std::cout << "Creating model..." << std::endl; OgaCreateModel(model_path, &model); OgaTokenizer* tokenizer; + std::cout << "Creating tokenizer..." << std::endl; CheckResult(OgaCreateTokenizer(model, &tokenizer)); const char* prompt = "def is_prime(num):"; @@ -84,7 +92,6 @@ int main(int argc, char** argv) { return -1; } - std::cout << "-------------" << std::endl; std::cout << "Hello, Phi-2!" 
<< std::endl; std::cout << "-------------" << std::endl; From a9fd3264ce6a49c0f9003710d42f24b96e32accc Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 29 Apr 2024 16:07:14 -0400 Subject: [PATCH 03/15] Merging Rel-0.2.0 back to main (#342) Co-authored-by: Patrice Vignola Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Co-authored-by: Nat Kershaw (MSFT) Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Kunal Vaishnavi Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Co-authored-by: Yufeng Li Co-authored-by: Baiju Meswani --- .pipelines/nuget-publishing.yml | 14 +- .pipelines/pypl-publishing.yml | 14 +- .../stages/jobs/nuget-packaging-job.yml | 27 +++- .pipelines/stages/jobs/py-packaging-job.yml | 21 ++- .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../stages/jobs/steps/capi-win-step.yml | 16 ++- .../stages/jobs/steps/nuget-win-step.yml | 4 +- .../stages/jobs/steps/utils/capi-archive.yml | 1 + .../stages/jobs/steps/utils/download-ort.yml | 74 +++++++---- .pipelines/stages/nuget-packaging-stage.yml | 19 ++- .pipelines/stages/py-packaging-stage.yml | 21 ++- benchmark/c/main.cpp | 2 +- cmake/presets/CMakeWinConfigPresets.json | 40 ++---- examples/python/phi-3-tutorial.md | 121 ++++++++++++++---- src/csharp/GeneratorParams.cs | 5 + src/csharp/NativeMethods.cs | 8 +- src/ort_genai.h | 8 +- src/ort_genai_c.cpp | 22 +++- src/ort_genai_c.h | 5 +- 19 files changed, 320 insertions(+), 104 deletions(-) diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index b6fff7111..00083c65c 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -19,12 +19,21 @@ parameters: type: boolean default: true +- name: enable_win_dml + displayName: 'Whether Windows DirectML package is built.' + type: boolean + default: true - name: ort_version displayName: 'OnnxRuntime version' type: string default: '1.17.3' +- name: ort_dml_version + displayName: 'OnnxRuntime DirectML version' + type: string + default: '1.18.0-dev-20240423-0527-c07b8d545d' + - name: cuda_version displayName: 'CUDA version' type: string @@ -54,6 +63,9 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_linux_cpu: ${{ parameters.enable_linux_cpu }} enable_linux_cuda: ${{ parameters.enable_linux_cuda }} + enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + \ No newline at end of file diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index 1069ede44..1b493ca69 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -7,6 +7,11 @@ parameters: - name: enable_win_cuda displayName: 'Whether Windows CUDA package is built.' type: boolean + default : true + +- name: enable_win_dml + displayName: 'Whether Windows DirectML package is built.' 
+ type: boolean default: true - name: enable_linux_cpu @@ -24,6 +29,11 @@ parameters: type: string default: '1.17.3' +- name: ort_dml_version + displayName: 'OnnxRuntime DirectML version' + type: string + default: '1.18.0-dev-20240423-0527-c07b8d545d' + - name: cuda_version displayName: 'CUDA version' type: string @@ -53,6 +63,8 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_cpu: ${{ parameters.enable_win_cpu }} enable_win_cuda: ${{ parameters.enable_win_cuda }} + enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index f790215b7..f971a4586 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -21,7 +21,10 @@ jobs: ${{ if eq(parameters.os, 'linux') }}: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' ${{ if eq(parameters.os, 'win') }}: - pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.ep, 'directml') }}: + pool: 'onnxruntime-Win2022-GPU-dml-A10' + ${{ else }}: + pool: 'onnxruntime-Win-CPU-2022' timeoutInMinutes: 180 # set variables here to be used in the template and steps variables: @@ -44,21 +47,39 @@ jobs: - name: ort_filename ${{ if eq(parameters.ep, 'cpu') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: + ${{ elseif eq(parameters.ep, 'cuda') }}: ${{if eq(parameters.cuda_version, '11.8') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' ${{ else }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + ${{ elseif eq(parameters.ep, 'directml')}}: + value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' - name: genai_nuget_ext ${{ if eq(parameters.ep, 'cpu') }}: value: '' ${{ if eq(parameters.ep, 'cuda') }}: value: '.Cuda' + ${{ if eq(parameters.ep, 'directml') }}: + value: '.DirectML' - name: ortHome - value: 'ort' + value: 'ort' + - name: dml_dir + value: 'Microsoft.AI.DirectML.1.14.1' + - name: dml_zip + value: 'Microsoft.AI.DirectML.1.14.1.zip' + - name: dml_url + value: "https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1" workspace: clean: all steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: 3.10 + addToPath: true + architecture: $(arch) + - ${{ if eq(parameters.os, 'linux') }}: - template: steps/capi-linux-step.yml parameters: diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml index 7066b070e..ba9bc5731 100644 --- a/.pipelines/stages/jobs/py-packaging-job.yml +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -21,7 +21,11 @@ jobs: ${{ if eq(parameters.os, 'linux') }}: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' ${{ if eq(parameters.os, 'win') }}: - pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.ep, 'directml') }}: + pool: 'onnxruntime-Win2022-GPU-dml-A10' + ${{ else }}: + pool: 'onnxruntime-Win-CPU-2022' + strategy: matrix: Python38: @@ -66,11 +70,23 @@ jobs: - name: ort_filename ${{ if eq(parameters.ep, 
'cpu') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: + ${{ elseif eq(parameters.ep, 'cuda') }}: ${{if eq(parameters.cuda_version, '11.8') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' ${{ else }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + ${{ elseif eq(parameters.ep, 'directml')}}: + value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' + + - name: dml_dir + value: 'Microsoft.AI.DirectML.1.14.1' + - name: dml_zip + value: 'Microsoft.AI.DirectML.1.14.1.zip' + - name: dml_url + value: "https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1" + steps: - task: UsePythonVersion@0 inputs: @@ -97,6 +113,7 @@ jobs: - template: steps/capi-win-step.yml parameters: target: 'python' + ep: ${{ parameters.ep }} - ${{ if eq(parameters.publish_to_ado_feed, true)}}: - template: steps/py-ado-feed-releasing-step.yml diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 03f76feb1..2ff4d5c74 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -26,10 +26,10 @@ steps: echo "arch=$(arch)" echo "ep=$(ep)" displayName: 'Print Parameters' - - template: utils/download-ort.yml parameters: archiveType: 'tgz' + - bash: | set -e -x az login --identity --username 63b63039-6328-442f-954b-5a64d124e5b4 diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index 3681bffd4..8afafdb41 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -2,6 +2,9 @@ parameters: - name: target type: string default: 'onnxruntime-genai' +- name: ep + type: string + default: 'cpu' steps: - bash: | echo "##[error]Error: ep and arch are not set" @@ -28,10 +31,21 @@ steps: echo "cuda_version=$(cuda_version)" echo "target=${{ parameters.target }}" displayName: 'Print Parameters' - - template: utils/download-ort.yml parameters: archiveType: 'zip' + ep: ${{ parameters.ep }} + +- ${{ if eq(parameters.ep, 'directml') }}: + - powershell: | + Invoke-WebRequest -Uri $(dml_url) -OutFile $(dml_zip) + Expand-Archive $(dml_zip) -DestinationPath $(dml_dir) + Remove-Item -Path $(dml_zip) + Get-ChildItem -Recurse $(dml_dir) + mv $(dml_dir)\bin\x64-win\DirectML.dll ort\lib + mv $(dml_dir)\include\DirectML.h ort\include + workingDirectory: '$(Build.Repository.LocalPath)' + continueOnError: true - powershell: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v$(cuda_version)" 'cuda_sdk' diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index b5acf6a5a..af502e0df 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -16,12 +16,12 @@ steps: DisplayName: 'ESRP - Sign C# dlls' Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | - $VERSION = '0.1.0-rc4' + $VERSION = '0.2.0-rc4' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` -Prop version=$VERSION ` -Prop genai_nuget_ext=$(genai_nuget_ext) ` -Prop configuration=$(buildConfig) ` - -Prop buildPath=$(buildDir) + -Prop buildPath=$(buildDir) ` -Prop ortHome=$(ortHome) nuget.exe 
pack Microsoft.ML.OnnxRuntimeGenAI.Managed.nuspec ` -Prop version=$VERSION ` diff --git a/.pipelines/stages/jobs/steps/utils/capi-archive.yml b/.pipelines/stages/jobs/steps/utils/capi-archive.yml index 1395b31f7..6034d255c 100644 --- a/.pipelines/stages/jobs/steps/utils/capi-archive.yml +++ b/.pipelines/stages/jobs/steps/utils/capi-archive.yml @@ -39,6 +39,7 @@ steps: SourceFolder: '$(Build.Repository.LocalPath)/src' Contents: | ort_genai_c.h + ort_genai.h TargetFolder: '$(Build.ArtifactStagingDirectory)/$(artifactName)/include' - task: CopyFiles@2 diff --git a/.pipelines/stages/jobs/steps/utils/download-ort.yml b/.pipelines/stages/jobs/steps/utils/download-ort.yml index 78ffadd7c..5346bade8 100644 --- a/.pipelines/stages/jobs/steps/utils/download-ort.yml +++ b/.pipelines/stages/jobs/steps/utils/download-ort.yml @@ -1,6 +1,9 @@ parameters: - name: archiveType type: string +- name: ep + type: string + default: cpu steps: - bash: | echo "##[error]Error: ort_version and ort_filename are not set" @@ -8,29 +11,54 @@ steps: displayName: 'Check if variables ort_version and ort_filename are set' condition: or( eq (variables['ort_version'], ''), eq (variables['ort_filename'], '')) -- task: DownloadGitHubRelease@0 - inputs: - connection: 'GitHub - Release' - userRepository: 'microsoft/onnxruntime' - defaultVersionType: 'specificTag' - version: 'v$(ort_version)' - itemPattern: '$(ort_filename).${{ parameters.archiveType }}' - downloadPath: '$(Build.Repository.LocalPath)' - displayName: Download $(ort_filename) - -- task: ExtractFiles@1 - inputs: - archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).${{ parameters.archiveType }}' - destinationFolder: '$(Build.Repository.LocalPath)' - cleanDestinationFolder: false - overwriteExistingFiles: true - displayName: Unzip OnnxRuntime - -- task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/$(ort_filename)' - TargetFolder: '$(Build.Repository.LocalPath)/ort' - displayName: Copy OnnxRuntime to ort +#Special case for DML +- ${{ if ne(parameters.ep, 'directml') }}: + - task: DownloadGitHubRelease@0 + inputs: + connection: 'GitHub - Release' + userRepository: 'microsoft/onnxruntime' + defaultVersionType: 'specificTag' + version: 'v$(ort_version)' + itemPattern: '$(ort_filename).${{ parameters.archiveType }}' + downloadPath: '$(Build.Repository.LocalPath)' + displayName: Download $(ort_filename) + - task: ExtractFiles@1 + inputs: + archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).${{ parameters.archiveType }}' + destinationFolder: '$(Build.Repository.LocalPath)' + cleanDestinationFolder: false + overwriteExistingFiles: true + displayName: Unzip OnnxRuntime + - task: CopyFiles@2 + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/$(ort_filename)' + TargetFolder: '$(Build.Repository.LocalPath)/ort' + displayName: Copy OnnxRuntime to ort +- ${{ else }}: + - task: DownloadPackage@1 + inputs: + packageType: 'nuget' + feed: '2692857e-05ef-43b4-ba9c-ccf1c22c437c/7982ae20-ed19-4a35-a362-a96ac99897b7' + definition: 'Microsoft.ML.OnnxRuntime.DirectML' # Can also be package name + version: '$(ort_version)' + extract: false + downloadPath: '$(Build.Repository.LocalPath)' + displayName: Download $(ort_filename) + - task: ExtractFiles@1 + inputs: + archiveFilePatterns: '$(Build.Repository.LocalPath)/*.nupkg' + destinationFolder: '$(Build.Repository.LocalPath)/ort' + cleanDestinationFolder: false + overwriteExistingFiles: true + displayName: Unzip OnnxRuntime + - task: CopyFiles@2 + inputs: + SourceFolder: 
'$(Build.Repository.LocalPath)/ort/runtimes/win-x64/native' + TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' + - task: CopyFiles@2 + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/ort/build/native/include' + TargetFolder: '$(Build.Repository.LocalPath)/ort/include' - task: DeleteFiles@1 inputs: diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index f962337ac..93bd7ac84 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -3,12 +3,16 @@ parameters: type: boolean - name: enable_win_cuda type: boolean +- name: enable_win_dml + type: boolean - name: enable_linux_cpu type: boolean - name: enable_linux_cuda type: boolean - name: ort_version type: string +- name: ort_dml_version + type: string - name: cuda_version type: string default: '' @@ -26,6 +30,16 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + + - ${{ if eq(parameters.enable_win_dml, true) }}: + - template: jobs/nuget-packaging-job.yml + parameters: + arch: 'x64' + ep: 'directml' + ort_version: ${{ parameters.ort_dml_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -35,6 +49,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -43,6 +58,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -51,4 +67,5 @@ stages: ep: 'cuda' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index e23581f56..efbd28d9e 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -3,12 +3,16 @@ parameters: type: boolean - name: enable_win_cuda type: boolean +- name: enable_win_dml + type: boolean - name: enable_linux_cpu type: boolean - name: enable_linux_cuda type: boolean - name: ort_version type: string +- name: ort_dml_version + type: string - name: cuda_version type: string default: '' @@ -26,6 +30,16 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + + - ${{ if eq(parameters.enable_win_dml, true) }}: + - template: jobs/py-packaging-job.yml + parameters: + arch: 'x64' + ep: 'directml' + ort_version: ${{ parameters.ort_dml_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: @@ -35,6 +49,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/py-packaging-job.yml @@ -44,6 +59,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + 
- ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: @@ -53,7 +69,4 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - - - - + \ No newline at end of file diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 3a4c9b43b..2d4b62b1f 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -112,7 +112,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); diff --git a/cmake/presets/CMakeWinConfigPresets.json b/cmake/presets/CMakeWinConfigPresets.json index 57b74ae0d..3c885fdef 100644 --- a/cmake/presets/CMakeWinConfigPresets.json +++ b/cmake/presets/CMakeWinConfigPresets.json @@ -271,10 +271,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml release asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_debug_asan", @@ -283,10 +280,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml debug asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_relwithdebinfo_asan", @@ -295,10 +289,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml relwithdebinfo asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_minsizerel_asan", @@ -307,10 +298,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml minsizerel asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_release", @@ -319,10 +307,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml release", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_x64_directml_debug", @@ -331,10 +316,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml debug", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_x64_directml_relwithdebinfo", @@ -343,10 +325,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml relwithdebinfo", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_x64_directml_minsizerel", @@ -355,10 +334,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml minsizerel", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_arm64_cpu_relwithdebinfo", diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 4e8a5660c..2e3b93aa3 100644 --- a/examples/python/phi-3-tutorial.md +++ 
b/examples/python/phi-3-tutorial.md @@ -2,62 +2,139 @@ ## Steps 1. [Download Phi-3 Mini](#download-the-model) -2. [Install the generate() API](#install-the-generate()-api-package) -3. [Run Phi-3 Mini](#run-the-model) +2. [Build ONNX Runtime shared libraries](#build-onnx-runtime-from-source) +3. [Build generate() API](#build-the-generate-api-from-source) +4. [Run Phi-3 Mini](#run-the-model) ## Download the model Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face. +There are ONNX models for CPU (used for mobile too), as well as DirectML and CUDA. -For the short context model. + +## Install the generate() API package + +Right now, both `onnxruntime` and `onnxruntime-genai` need to be built from source. Once packages are published, this tutorial will be updated. + +The instructions for how to build both packages from source are documented in the [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html) guide. They are repeated here for your convenience. + +### Pre-requisites + +#### CMake + +This is included on Windows if you have Visual Studio installed. If you are running on Linux or Mac, you can install it using `conda`. ```bash -git clone https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx +conda install cmake ``` -For the long context model +### Build ONNX Runtime from source + +#### Clone the repo ```bash -git clone https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx +git clone https://github.com/microsoft/onnxruntime.git +cd onnxruntime ``` -These model repositories have models that run with DirectML, CPU and CUDA. +#### Build ONNX Runtime for DirectML on Windows -## Install the generate() API package +```bash +build.bat --build_shared_lib --skip_tests --parallel --use_dml --config Release +``` -### DirectML +#### Build ONNX Runtime for CPU on Windows +```bash +build.bat --build_shared_lib --skip_tests --parallel --config Release ``` -pip install numpy -pip install --pre onnxruntime-genai-directml + +#### Build ONNX Runtime for CUDA on Windows + +```bash +build.bat --build_shared_lib --skip_tests --parallel --use_cuda --config Release ``` -### CPU +#### Build ONNX Runtine on Linux +```bash +./build.sh --build_shared_lib --skip_tests --parallel [--use_cuda] --config Release ``` -pip install numpy -pip install --pre onnxruntime-genai + +You may need to provide extra command line options for building with CUDA on Linux. An example full command is as follows. + +```bash +./build.sh --parallel --build_shared_lib --use_cuda --cuda_version 11.8 --cuda_home /usr/local/cuda-11.8 --cudnn_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="80" --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda-11.8/bin/nvcc ``` -### CUDA +Replace the values given above for different versions and locations of CUDA. 
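If you are unsure which values to substitute, the CUDA toolkit already installed on the machine can be queried first. A minimal sketch, assuming an NVIDIA driver and CUDA toolkit are present (the paths shown are illustrative, not prescriptive):

```bash
# Report the installed toolkit version (e.g. 11.8) to pass as --cuda_version
nvcc --version

# Typical toolkit install locations to pass as --cuda_home on Linux
ls -d /usr/local/cuda*

# Driver and GPU details, useful when choosing CMAKE_CUDA_ARCHITECTURES
nvidia-smi
```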
+#### Build ONNX Runtime on Mac + +```bash +./build.sh --build_shared_lib --skip_tests --parallel --config Release ``` -pip install numpy -pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ + +### Build the generate() API from source + +#### Clone the repo + +```bash +git clone https://github.com/microsoft/onnxruntime-genai +cd onnxruntime-genai +mkdir -p ort/include +mkdir -p ort/lib +``` + +#### Build the generate() API on Windows + + +If building for DirectML + +```bash +copy ..\onnxruntime\include\onnxruntime\core\providers\dml\dml_provider_factory.h ort\include +``` + +```bash +copy ..\onnxruntime\include\onnxruntime\core\session\onnxruntime_c_api.h ort\include +copy ..\onnxruntime\build\Windows\Release\Release\*.dll ort\lib +copy ..\onnxruntime\build\Windows\Release\Release\onnxruntime.lib ort\lib +python build.py [--use_dml | --use_cuda] +cd build\wheel +pip install *.whl +``` + + +#### Build the generate() API on Linux + +```bash +cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include +cp ../onnxruntime/build/Linux/Release/libonnxruntime*.so* ort/lib +python build.py [--use_cuda] +cd build/wheel +pip install *.whl +``` + +#### Build the generate() API on Mac + +```bash +cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include +cp ../onnxruntime/build/MacOS/Release/libonnxruntime*.dylib* ort/lib +python build.py +cd build/wheel +pip install *.whl ``` ## Run the model -Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [this script](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. -This example is using the long context model running with DirectML on Windows. - ```bash -curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py -python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 +pip install numpy +python model-qa.py -m models/phi3-mini-4k-instruct-cpu-int4-rtn-block-32 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. 
For example: diff --git a/src/csharp/GeneratorParams.cs b/src/csharp/GeneratorParams.cs index 5aee3be3e..a48e5c4a5 100644 --- a/src/csharp/GeneratorParams.cs +++ b/src/csharp/GeneratorParams.cs @@ -30,6 +30,11 @@ public void SetSearchOption(string searchOption, bool value) Result.VerifySuccess(NativeMethods.OgaGeneratorParamsSetSearchBool(_generatorParamsHandle, StringUtils.ToUtf8(searchOption), value)); } + public void TryGraphCaptureWithMaxBatchSize(int maxBatchSize) + { + Result.VerifySuccess(NativeMethods.OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(_generatorParamsHandle, maxBatchSize)); + } + public void SetInputIDs(ReadOnlySpan inputIDs, ulong sequenceLength, ulong batchSize) { unsafe diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index da9d20e29..f2906f3df 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -53,6 +53,10 @@ internal class NativeLib byte[] /* const char* */ searchOption, bool value); + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(IntPtr /* OgaGeneratorParams* */ generatorParams, + int /* int32_t */ maxBatchSize); + // This function is used to set the input IDs for the generator. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern unsafe IntPtr /* OgaResult* */ OgaGeneratorParamsSetInputIDs(IntPtr /* OgaGeneratorParams* */ generatorParams, @@ -67,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); @@ -125,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. 
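// Note (assumption, not stated in the patch itself): the const qualifier on the model pointer is
// dropped in the signatures below, which appears to follow from the C API change later in this
// patch where OgaGenerate/OgaCreateGenerator call GetMaxBatchSizeFromGeneratorParams on the model
// before generating, so the model can no longer be treated as read-only.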
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); diff --git a/src/ort_genai.h b/src/ort_genai.h index e2c560637..fb863dae2 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) const { + std::unique_ptr Generate(const OgaGeneratorParams& params) { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -193,11 +193,15 @@ struct OgaGeneratorParams : OgaAbstract { OgaCheckResult(OgaGeneratorParamsSetInputSequences(this, &sequences)); } + void TryGraphCaptureWithMaxBatchSize(int max_batch_size) { + OgaCheckResult(OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(this, max_batch_size)); + } + static void operator delete(void* p) { OgaDestroyGeneratorParams(reinterpret_cast(p)); } }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 1245c60d8..13cae5235 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -105,6 +105,14 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OGA_CATCH } +OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { + OGA_TRY + auto* params = reinterpret_cast(generator_params); + params->max_batch_size = max_batch_size; + return nullptr; + OGA_CATCH +} + OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputIDs(OgaGeneratorParams* oga_params, const int32_t* input_ids, size_t input_ids_count, size_t sequence_length, size_t batch_size) { OGA_TRY auto& params = *reinterpret_cast(oga_params); @@ -135,17 +143,23 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { +OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); + auto* model_p = reinterpret_cast(model); + auto* params = reinterpret_cast(generator_params); + model_p->GetMaxBatchSizeFromGeneratorParams(*params); + auto result = Generators::Generate(*model_p, *params); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); + auto* model_p = reinterpret_cast(model); + auto* params = reinterpret_cast(generator_params); + model_p->GetMaxBatchSizeFromGeneratorParams(*params); + *out = 
reinterpret_cast(CreateGenerator(*model_p, *params).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 5b6b9034f..0939d2c36 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -117,7 +117,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. * \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -135,6 +135,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyGeneratorParams(OgaGeneratorParams* gener OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchNumber(OgaGeneratorParams* generator_params, const char* name, double value); OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* generator_params, const char* name, bool value); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size); /* * \brief Sets the input ids for the generator params. The input ids are used to seed the generation. @@ -166,7 +167,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. From bf7d2f10e1d15c9848ff29824a5a9c142418dc7e Mon Sep 17 00:00:00 2001 From: Parinita Rahi <101819959+parinitarahi@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:38:06 -0700 Subject: [PATCH 04/15] Update README.md (#350) Added changes based on Marco's feedback --- README.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 68b1cfbd5..f6cab5472 100644 --- a/README.md +++ b/README.md @@ -51,24 +51,33 @@ See full documentation at [https://onnxruntime.ai/docs/genai]. ## Installation -### DirectML +If you don't know which hardware capabilities is available on your device. 
+* Windows GPU (use DirectML): [Verify if you have Windows GPU](https://www.microsoft.com/en-us/windows/learning-center/how-to-check-gpu) + +* CUDA GPU: [Verify if you have CUDA GPU](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#verify-you-have-a-cuda-capable-gpu) + +* CPU and Mobile: For Windows, Mac, Android and other devices use the CPU and Mobile option below + +### Windows GPU (DirectML) ```bash pip install [--pre] numpy onnxruntime-genai-directml ``` -### CPU +### CUDA GPU ```bash -pip install [--pre] numpy onnxruntime-genai +pip install numpy onnxruntime-genai-cuda --pre --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ ``` -### CUDA + +### CPU and Mobile ```bash -pip install numpy onnxruntime-genai-cuda --pre --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ +pip install [--pre] numpy onnxruntime-genai ``` + ## Sample code for phi-2 in Python [Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package. From 2fa3964db05e095a2d1eea1aca4885c67c870f09 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Mon, 29 Apr 2024 14:57:38 -0700 Subject: [PATCH 05/15] Change phi-3-tutorial.md back (#354) --- examples/python/phi-3-tutorial.md | 121 ++++++------------------------ 1 file changed, 22 insertions(+), 99 deletions(-) diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 2e3b93aa3..4e8a5660c 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -2,139 +2,62 @@ ## Steps 1. [Download Phi-3 Mini](#download-the-model) -2. [Build ONNX Runtime shared libraries](#build-onnx-runtime-from-source) -3. [Build generate() API](#build-the-generate-api-from-source) -4. [Run Phi-3 Mini](#run-the-model) +2. [Install the generate() API](#install-the-generate()-api-package) +3. [Run Phi-3 Mini](#run-the-model) ## Download the model Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face. -There are ONNX models for CPU (used for mobile too), as well as DirectML and CUDA. - -## Install the generate() API package - -Right now, both `onnxruntime` and `onnxruntime-genai` need to be built from source. Once packages are published, this tutorial will be updated. - -The instructions for how to build both packages from source are documented in the [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html) guide. They are repeated here for your convenience. - -### Pre-requisites - -#### CMake - -This is included on Windows if you have Visual Studio installed. If you are running on Linux or Mac, you can install it using `conda`. - -```bash -conda install cmake -``` - -### Build ONNX Runtime from source - -#### Clone the repo - -```bash -git clone https://github.com/microsoft/onnxruntime.git -cd onnxruntime -``` - -#### Build ONNX Runtime for DirectML on Windows - -```bash -build.bat --build_shared_lib --skip_tests --parallel --use_dml --config Release -``` - -#### Build ONNX Runtime for CPU on Windows - -```bash -build.bat --build_shared_lib --skip_tests --parallel --config Release -``` - -#### Build ONNX Runtime for CUDA on Windows +For the short context model. 
```bash -build.bat --build_shared_lib --skip_tests --parallel --use_cuda --config Release +git clone https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx ``` -#### Build ONNX Runtine on Linux +For the long context model ```bash -./build.sh --build_shared_lib --skip_tests --parallel [--use_cuda] --config Release +git clone https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx ``` -You may need to provide extra command line options for building with CUDA on Linux. An example full command is as follows. +These model repositories have models that run with DirectML, CPU and CUDA. -```bash -./build.sh --parallel --build_shared_lib --use_cuda --cuda_version 11.8 --cuda_home /usr/local/cuda-11.8 --cudnn_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="80" --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda-11.8/bin/nvcc -``` - -Replace the values given above for different versions and locations of CUDA. +## Install the generate() API package -#### Build ONNX Runtime on Mac +### DirectML -```bash -./build.sh --build_shared_lib --skip_tests --parallel --config Release ``` - -### Build the generate() API from source - -#### Clone the repo - -```bash -git clone https://github.com/microsoft/onnxruntime-genai -cd onnxruntime-genai -mkdir -p ort/include -mkdir -p ort/lib +pip install numpy +pip install --pre onnxruntime-genai-directml ``` -#### Build the generate() API on Windows - - -If building for DirectML +### CPU -```bash -copy ..\onnxruntime\include\onnxruntime\core\providers\dml\dml_provider_factory.h ort\include ``` - -```bash -copy ..\onnxruntime\include\onnxruntime\core\session\onnxruntime_c_api.h ort\include -copy ..\onnxruntime\build\Windows\Release\Release\*.dll ort\lib -copy ..\onnxruntime\build\Windows\Release\Release\onnxruntime.lib ort\lib -python build.py [--use_dml | --use_cuda] -cd build\wheel -pip install *.whl +pip install numpy +pip install --pre onnxruntime-genai ``` +### CUDA -#### Build the generate() API on Linux - -```bash -cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include -cp ../onnxruntime/build/Linux/Release/libonnxruntime*.so* ort/lib -python build.py [--use_cuda] -cd build/wheel -pip install *.whl ``` - -#### Build the generate() API on Mac - -```bash -cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include -cp ../onnxruntime/build/MacOS/Release/libonnxruntime*.dylib* ort/lib -python build.py -cd build/wheel -pip install *.whl +pip install numpy +pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ ``` ## Run the model -Run the model with [this script](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. +This example is using the long context model running with DirectML on Windows. 
+ ```bash -pip install numpy -python model-qa.py -m models/phi3-mini-4k-instruct-cpu-int4-rtn-block-32 +curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py +python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: From 2f619f904ca446427f914bef4540545645f66ed5 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:19:40 -0700 Subject: [PATCH 06/15] Remove NO_TOKENIZER build option (#353) We always build with tokenizer now, and the option was broken anyways as it wasn't being updated. --- CMakeLists.txt | 15 +++++---------- cmake/options.cmake | 1 - src/models/model.cpp | 22 ---------------------- src/models/model.h | 17 ----------------- test/CMakeLists.txt | 9 +++------ test/model_tests.cpp | 8 -------- 6 files changed, 8 insertions(+), 64 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index acf8f22f6..7aa8021a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,16 +179,11 @@ if(USE_DML) add_dependencies(onnxruntime-genai-static RESTORE_PACKAGES) endif() -if(NO_TOKENIZEROOT) - add_compile_definitions(NO_TOKENIZER=1) - message("----------------Tokenizer Disabled------------------") -else() - add_subdirectory("${CMAKE_SOURCE_DIR}/src/tokenizer") - target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT}) - target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT}) - target_link_libraries(onnxruntime-genai PRIVATE tokenizer) - target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer) -endif() +add_subdirectory("${CMAKE_SOURCE_DIR}/src/tokenizer") +target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT}) +target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT}) +target_link_libraries(onnxruntime-genai PRIVATE tokenizer) +target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer) if(ENABLE_TESTS) add_subdirectory("${CMAKE_SOURCE_DIR}/test") diff --git a/cmake/options.cmake b/cmake/options.cmake index ac40a6d1d..688633fda 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -2,7 +2,6 @@ include(CMakeDependentOption) option(USE_CUDA "Build with CUDA support" ON) option(USE_DML "Build with DML support" OFF) -option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." 
ON) option(ENABLE_TESTS "Enable tests" ON) option(TEST_PHI2 "Enable tests for Phi2" OFF) diff --git a/src/models/model.cpp b/src/models/model.cpp index 4e7aa7343..439ab5c6a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -94,26 +94,6 @@ std::vector PadInputs(std::span> sequences, in return result; } -#ifdef NO_TOKENIZER -const std::string& TokenizerStream::Decode(int32_t token) { - throw std::runtime_error("Tokenizer not enabled"); -} - -std::unique_ptr Tokenizer::CreateStream() const { - return std::make_unique(); -} - -Tokenizer::Tokenizer(Config& config) { -} - -std::vector Tokenizer::Encode(const char* text) const { - throw std::runtime_error("Tokenizer not enabled"); -} - -std::string Tokenizer::Decode(std::span tokens) const { - throw std::runtime_error("Tokenizer not enabled"); -} -#else void CheckResult(tfmError_t error) { if (error != kTfmOK) throw std::runtime_error(TfmGetLastErrorMessage()); @@ -179,8 +159,6 @@ std::vector Tokenizer::DecodeBatch(std::span sequenc return strings; } -#endif - #if USE_CUDA // Since Python/Others can and will hold onto a generator object past the model object's lifetime we need to ensure // the allocator used is not destroyed until last. This keeps the allocator around until exit, after all other memory diff --git a/src/models/model.h b/src/models/model.h index b569373f8..fe3b9d832 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -1,8 +1,5 @@ #pragma once -#ifndef NO_TOKENIZER #include "tfmtok_c.h" -#endif - #include "captured_graph_pool.h" #if USE_DML @@ -36,19 +33,6 @@ struct State { void ClearIO(); // Clear all inputs/outputs }; -#ifdef NO_TOKENIZER -struct TokenizerStream { - const std::string& Decode(int32_t token); -}; - -struct Tokenizer { - Tokenizer(Config& config); - - std::vector Encode(const char* text) const; - std::string Decode(std::span tokens) const; -}; -#else - template struct TfmPtr { ~TfmPtr() { TfmDispose(&p_); } @@ -94,7 +78,6 @@ struct Tokenizer : std::enable_shared_from_this { private: int32_t pad_token_id_; }; -#endif struct SessionInfo { SessionInfo(OrtSession& session); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index daf8c40b3..80bb58fcd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -43,12 +43,9 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) target_sources(unit_tests PRIVATE ${cuda_test_srcs}) endif() -if(NO_TOKENIZER) - add_compile_definitions(NO_TOKENIZER=1) -else() - target_include_directories(unit_tests PRIVATE ${TOKENIZER_ROOT}) - target_link_libraries(unit_tests PRIVATE tokenizer) -endif() +target_include_directories(unit_tests PRIVATE ${TOKENIZER_ROOT}) +target_link_libraries(unit_tests PRIVATE tokenizer) + set(TEST_MODEL_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/test_models/") set(TEST_MODEL_DES_DIR "$/test_models/") add_custom_command(TARGET unit_tests POST_BUILD diff --git a/test/model_tests.cpp b/test/model_tests.cpp index 66ceaee83..73c6464e0 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -203,7 +203,6 @@ TEST(ModelTests, BeamSearchGptCuda) { TEST(ModelTests, TestApiCuda) { #if TEST_PHI2 -#ifndef NO_TOKENIZER auto prompt = R"( def print_prime(n): @@ -234,15 +233,11 @@ Print all primes between 1 and n auto result = generator->GetSequence(0); std::cout << tokenizer->Decode(result.GetCPU()) << "\r\n"; -#else - std::cout << "Test skipped - not built with onnxruntime extensions\r\n"; -#endif #endif } TEST(ModelTests, TestHighLevelApiCuda) { #if TEST_PHI2 -#ifndef NO_TOKENIZER auto prompt = R"( def print_prime(n): ''' @@ -266,9 +261,6 @@ Print all primes 
between 1 and n auto result = Generators::Generate(*model, *params); std::cout << tokenizer->Decode(result[0]) << "\r\n"; -#else - std::cout << "Test skipped - not built with onnxruntime extensions\r\n"; -#endif #endif } From 639d176d4a39cb32dc1087644f0a9c2a18bcf5a5 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 29 Apr 2024 18:26:33 -0700 Subject: [PATCH 07/15] Use the most performant adapter for DML (#333) --- CMakeLists.txt | 4 ++-- src/dml/dml_helpers.cpp | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7aa8021a1..7eaa9f83c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,8 +150,8 @@ if(USE_DML) target_include_directories(onnxruntime-genai-static PUBLIC $) target_include_directories(onnxruntime-genai-static PUBLIC $/directx) target_include_directories(onnxruntime-genai-static PUBLIC $) - target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib) - target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib) + target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib dxcore.lib dxguid.lib) + target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib dxcore.lib dxguid.lib) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps ABSOLUTE) set(DXC_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.Direct3D.DXC.1.7.2308.12) diff --git a/src/dml/dml_helpers.cpp b/src/dml/dml_helpers.cpp index e7a0c2f08..2dcd0267f 100644 --- a/src/dml/dml_helpers.cpp +++ b/src/dml/dml_helpers.cpp @@ -9,6 +9,29 @@ namespace DmlHelpers { +static ComPtr CreatePerformantAdapter() { + ComPtr adapter_factory; + THROW_IF_FAILED(DXCoreCreateAdapterFactory(adapter_factory.GetAddressOf())); + + ComPtr adapter_list; + THROW_IF_FAILED(adapter_factory->CreateAdapterList( + 1, + &DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE, + adapter_list.GetAddressOf())); + + // We prefer the hightest performance adapter + std::array adapter_list_preferences = {DXCoreAdapterPreference::HighPerformance}; + + THROW_IF_FAILED(adapter_list->Sort( + static_cast(adapter_list_preferences.size()), + adapter_list_preferences.data())); + + ComPtr performant_adapter; + THROW_IF_FAILED(adapter_list->GetAdapter(0, performant_adapter.GetAddressOf())); + + return performant_adapter; +} + DmlObjects CreateDmlObjects() { D3D12_COMMAND_QUEUE_DESC command_queue_description = { D3D12_COMMAND_LIST_TYPE_COMPUTE, @@ -19,7 +42,8 @@ DmlObjects CreateDmlObjects() { DmlObjects dml_objects; - THROW_IF_FAILED(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); + auto adapter = CreatePerformantAdapter(); + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandQueue(&command_queue_description, IID_PPV_ARGS(&dml_objects.command_queue))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&dml_objects.command_allocator))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, dml_objects.command_allocator.Get(), nullptr, IID_PPV_ARGS(&dml_objects.command_list))); From 514493be37eebbfcdf387c3d83ac71650303d18a Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Mon, 29 Apr 2024 21:05:23 -0700 Subject: [PATCH 08/15] Improve tutorial (#355) --- examples/python/phi-3-tutorial.md | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/examples/python/phi-3-tutorial.md 
b/examples/python/phi-3-tutorial.md index 4e8a5660c..6d2f5f728 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -26,25 +26,40 @@ These model repositories have models that run with DirectML, CPU and CUDA. ## Install the generate() API package +**Unsure about which installation instructions to follow?** Here's a bit more guidance: + +Are you on Windows machine with GPU? +* I don't know → Review [this guide](https://www.microsoft.com/en-us/windows/learning-center/how-to-check-gpu) to see whether you have a GPU in your Windows machine. +* Yes → Follow the instructions for [DirectML](#directml). +* No → Do you have an NVIDIA GPU? + * I don't know → Review [this guide](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#verify-you-have-a-cuda-capable-gpu) to see whether you have a CUDA-capable GPU. + * Yes → Follow the instructions for [NVIDIA CUDA GPU](#nvidia-cuda-gpu). + * No → Follow the instructions for [CPU](#cpu). + +*Note: Only one package is required based on your hardware.* + ### DirectML + ``` pip install numpy pip install --pre onnxruntime-genai-directml ``` -### CPU +### NVIDIA CUDA GPU + ``` pip install numpy -pip install --pre onnxruntime-genai +pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ ``` -### CUDA +### CPU + ``` pip install numpy -pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ +pip install --pre onnxruntime-genai ``` ## Run the model @@ -55,6 +70,9 @@ The script accepts a model folder and takes the generation parameters from the c This example is using the long context model running with DirectML on Windows. +The `-m` argument is the path to the model you downloaded from HuggingFace above. +The `-l` argument is the length of output you would like to generate with the model. + ```bash curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 @@ -66,4 +84,4 @@ Once the script has loaded the model, it will ask you for input in a loop, strea Input: <|user|>Tell me a joke about creative writing<|end|><|assistant|> Output: Why don't writers ever get lost? Because they always follow the plot! -``` +``` \ No newline at end of file From a028d7879a2adc92f6a8bf0babe01d1c7398c15f Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:31:46 -0700 Subject: [PATCH 09/15] Simulate the chat template (#352) --- examples/python/model-qa.py | 12 ++++- examples/python/phi-3-tutorial.md | 6 +-- examples/python/phi3-qa.py | 87 +++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 examples/python/phi3-qa.py diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 6f323ccc4..57ec9f6db 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -15,6 +15,9 @@ def main(args): if args.verbose: print("Tokenizer created") if args.verbose: print() search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1: + print("Error, chat template must have exactly one pair of curly braces, e.g. 
'<|user|>\n{input} <|end|>\n<|assistant|>'") + exit(1) # Keep asking for input prompts in a loop while True: @@ -25,7 +28,12 @@ def main(args): if args.timings: started_timestamp = time.time() - input_tokens = tokenizer.encode(args.system_prompt + text) + # If there is a chat template, use it + prompt = text + if args.chat_template: + prompt = f'{args.chat_template.format(input=text)}' + + input_tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) params.try_use_cuda_graph_with_max_batch_size(1) @@ -76,7 +84,7 @@ def main(args): parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-s', '--system_prompt', type=str, default='', help='Prepend a system prompt to the user input prompt. Defaults to empty') parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}') args = parser.parse_args() main(args) \ No newline at end of file diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 6d2f5f728..5442886a3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -64,7 +64,7 @@ pip install --pre onnxruntime-genai ## Run the model -Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. @@ -74,8 +74,8 @@ The `-m` argument is the path to the model you downloaded from HuggingFace above The `-l` argument is the length of output you would like to generate with the model. ```bash -curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py -python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 +curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py +python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. 
For example: diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py new file mode 100644 index 000000000..9e9392895 --- /dev/null +++ b/examples/python/phi3-qa.py @@ -0,0 +1,87 @@ +import onnxruntime_genai as og +import argparse +import time + +def main(args): + if args.verbose: print("Loading model...") + if args.timings: + started_timestamp = 0 + first_token_timestamp = 0 + + model = og.Model(f'{args.model}') + if args.verbose: print("Model loaded") + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + if args.verbose: print("Tokenizer created") + if args.verbose: print() + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + + # Keep asking for input prompts in a loop + while True: + text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + + if args.timings: started_timestamp = time.time() + + # If there is a chat template, use it + prompt = f'{chat_template.format(input=text)}' + + print(f"Prompt: {prompt}") + + input_tokens = tokenizer.encode(prompt) + + params = og.GeneratorParams(model) + params.try_use_cuda_graph_with_max_batch_size(1) + params.set_search_options(**search_options) + params.input_ids = input_tokens + generator = og.Generator(model, params) + if args.verbose: print("Generator created") + + if args.verbose: print("Running generation loop ...") + if args.timings: + first = True + new_tokens = [] + + print() + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.compute_logits() + generator.generate_next_token() + if args.timings: + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end='', flush=True) + if args.timings: new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + print() + print() + + if args.timings: + prompt_time = first_token_timestamp - started_timestamp + run_time = time.time() - first_token_timestamp + print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)') + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') + parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. 
Defaults to false') + parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') + parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') + parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + args = parser.parse_args() + main(args) \ No newline at end of file From cb4e3aaf982abcd22adfc01a9d4663f76f645561 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:35:37 -0700 Subject: [PATCH 10/15] Update phi-3-tutorial.md (#361) --- examples/python/phi-3-tutorial.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 5442886a3..5ee9331e3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -81,7 +81,7 @@ python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-bl Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: ```bash -Input: <|user|>Tell me a joke about creative writing<|end|><|assistant|> +Input: Tell me a joke about creative writing Output: Why don't writers ever get lost? Because they always follow the plot! -``` \ No newline at end of file +``` From afd2edc892c58a41f6b69df0200e96ea6785d9e6 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 30 Apr 2024 14:48:45 -0700 Subject: [PATCH 11/15] Make OgaModel* const again (#356) --- benchmark/c/main.cpp | 2 +- src/csharp/NativeMethods.cs | 4 ++-- src/generators.cpp | 20 +++++++++++++++++++- src/generators.h | 6 ++++++ src/models/captured_graph_pool.cpp | 2 +- src/models/decoder_only.cpp | 4 ++-- src/models/model.cpp | 22 ---------------------- src/models/model.h | 5 ----- src/ort_genai.h | 4 ++-- src/ort_genai_c.cpp | 16 +++++----------- src/ort_genai_c.h | 4 ++-- src/python/python.cpp | 5 ++--- 12 files changed, 42 insertions(+), 52 deletions(-) diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 2d4b62b1f..3a4c9b43b 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -112,7 +112,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index f2906f3df..a56e7dd7e 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -71,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); 
@@ -129,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); diff --git a/src/generators.cpp b/src/generators.cpp index 0c664f341..bc00f8d3e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -61,7 +61,25 @@ GeneratorParams::GeneratorParams(const Model& model) eos_token_id{model.config_->model.eos_token_id}, vocab_size{model.config_->model.vocab_size}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_} { + cuda_stream{model.cuda_stream_}, + is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { +} + +void GeneratorParams::TryGraphCapture(int max_bs) { + if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + // no-op + return; + } + + if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (max_bs == 0) { + throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); + } + use_cuda_graph = true; + max_batch_size = max_bs; + } else { + throw std::runtime_error("CUDA graph is not supported on this device"); + } } std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { diff --git a/src/generators.h b/src/generators.h index c10868570..c6a510739 100644 --- a/src/generators.h +++ b/src/generators.h @@ -61,6 +61,7 @@ struct GeneratorParams : std::enable_shared_from_this { int batch_size{1}; int max_batch_size{0}; + bool use_cuda_graph{}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } @@ -97,6 +98,11 @@ struct GeneratorParams : std::enable_shared_from_this { std::vector input_ids_owner; // Backing memory of input_ids in some cases std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + + void TryGraphCapture(int max_bs); + + private: + bool is_cuda_graph_enabled_{}; }; struct Generator { diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 140f2a8cd..96cc029b8 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -24,7 +24,7 @@ static std::tuple MakeKey(int max_batch_size, int max_length, int } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { return nullptr; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 83d1f03d3..53f4f6697 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -26,7 +26,7 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, RoamingArra RoamingArray DecoderOnly_State::Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices) { if (first_run_) { - if (model_.use_cuda_graph_) { + if 
(params_->use_cuda_graph) { model_.run_options_->AddConfigEntry("gpu_graph_id", "-1"); } first_run_ = false; @@ -37,7 +37,7 @@ RoamingArray DecoderOnly_State::Run(int current_length, RoamingArrayuse_cuda_graph) { int new_batch_size = static_cast(input_ids_.GetShape()[0]); if (new_batch_size != current_batch_size_) { current_batch_size_ = new_batch_size; diff --git a/src/models/model.cpp b/src/models/model.cpp index 439ab5c6a..6f0cc294a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -515,26 +515,4 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, return expanded; } -void Model::GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params) { - bool is_cuda_graph_enabled = device_type_ == DeviceType::DML || IsCudaGraphEnabled(config_->model.decoder.session_options); - max_batch_size_ = params.max_batch_size; - - if (DeviceType::CUDA == device_type_) { - if (is_cuda_graph_enabled) { - if (max_batch_size_ == 0) { - throw std::runtime_error("CUDA graph is enabled, but max_batch_size is not set."); - } - use_cuda_graph_ = true; - } - } else if (DeviceType::DML == device_type_) { - if (max_batch_size_ == 0) { - throw std::runtime_error("max_batch_size needs to be set when using DirectML."); - } - - use_cuda_graph_ = true; - } else if (is_cuda_graph_enabled) { - throw std::runtime_error("CUDA graph is not supported on this device"); - } -} - } // namespace Generators diff --git a/src/models/model.h b/src/models/model.h index fe3b9d832..5b9ec12d9 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -102,8 +102,6 @@ struct Model : std::enable_shared_from_this { std::unique_ptr ExpandInputs(std::unique_ptr& input, int num_beams) const; - void GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params); - CapturedGraphPool* GetCapturedGraphPool() const { return captured_graph_pool_.get(); } std::unique_ptr config_; @@ -119,9 +117,6 @@ struct Model : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime - bool use_cuda_graph_{}; - int max_batch_size_{}; - #if USE_DML DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } diff --git a/src/ort_genai.h b/src/ort_genai.h index fb863dae2..b8e55bf19 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -201,7 +201,7 @@ struct OgaGeneratorParams : OgaAbstract { }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 13cae5235..d5ab67040 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -108,7 +108,7 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { OGA_TRY auto* params = reinterpret_cast(generator_params); - params->max_batch_size = 
max_batch_size; + params->TryGraphCapture(max_batch_size); return nullptr; OGA_CATCH } @@ -143,23 +143,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { +OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - auto result = Generators::Generate(*model_p, *params); + auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - *out = reinterpret_cast(CreateGenerator(*model_p, *params).release()); + *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 0939d2c36..3e44c29e4 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -117,7 +117,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. * \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -167,7 +167,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. 
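Seen from the Python binding (updated in the python.cpp hunk below), the refactor behaves roughly as in this sketch; the model folder and prompt are placeholders rather than anything this patch ships:

```python
# Minimal sketch, assuming a CUDA-capable build; "path/to/cuda/model" is a placeholder.
import onnxruntime_genai as og

model = og.Model("path/to/cuda/model")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
# The graph-capture decision now lives on GeneratorParams: TryGraphCapture is a
# no-op on CPU or when cuda graphs are disabled, throws if they are enabled but
# the batch size is 0, and throws on devices other than CUDA and DML.
params.try_use_cuda_graph_with_max_batch_size(1)
params.set_search_options(max_length=256)
params.input_ids = tokenizer.encode("def is_prime(num):")

# The generator no longer needs to mutate the model to pick up the max batch size.
generator = og.Generator(model, params)
```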
diff --git a/src/python/python.cpp b/src/python/python.cpp index cd974d916..1d8a4e567 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -104,7 +104,7 @@ struct PyGeneratorParams { } void TryUseCudaGraphWithMaxBatchSize(pybind11::int_ max_batch_size) { - params_->max_batch_size = max_batch_size.cast(); + params_->TryGraphCapture(max_batch_size.cast()); } pybind11::array_t py_input_ids_; @@ -115,7 +115,6 @@ struct PyGeneratorParams { struct PyGenerator { PyGenerator(Model& model, PyGeneratorParams& params) { params.Prepare(); - model.GetMaxBatchSizeFromGeneratorParams(params); generator_ = CreateGenerator(model, params); } @@ -229,7 +228,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) - .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); model.GetMaxBatchSizeFromGeneratorParams(params); return Generate(model, params); }) + .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") From eea971091978a7e8aa6d90a550d327e09c29d445 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 30 Apr 2024 15:22:55 -0700 Subject: [PATCH 12/15] update readme (#363) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6cab5472..113a0a5b7 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ prompt = '''def print_prime(n): tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) -params.set_search_options({"max_length":200}) +params.set_search_options(max_length=200) # Add the following line to enable cuda graph by passing the maximum batch size. 
# params.try_use_cuda_graph_with_max_batch_size(16) params.input_ids = tokens From 7dd45f2a6b3930543e65a0d2685d00eaa30fb522 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 30 Apr 2024 18:34:01 -0400 Subject: [PATCH 13/15] Use ort-nightly build for genai gha ci (#256) --- .github/workflows/linux-cpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/linux-gpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/mac-cpu-arm64-build.yml | 26 +++++----- .github/workflows/win-cpu-arm64-build.yml | 10 ++-- .github/workflows/win-cpu-x64-build.yml | 44 ++++++++++------ .github/workflows/win-cuda-x64-build.yml | 44 ++++++++++------ cmake/presets/CMakeMacOSConfigPresets.json | 2 +- nuget.config | 17 +++--- onnxruntime-genai.sln | 36 +++++++++++++ ...icrosoft.ML.OnnxRuntimeGenAI.Tests.csproj} | 3 +- 10 files changed, 208 insertions(+), 78 deletions(-) create mode 100644 onnxruntime-genai.sln rename test/csharp/{Microsoft.OnnxRuntimeGenAI.Tests.csproj => Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj} (92%) diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2e1c03aab..744fa567a 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -4,10 +4,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-1.17.3" - ort_zip: "onnxruntime-linux-x64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" jobs: linux_cpu_x64: runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ] @@ -16,19 +16,49 @@ jobs: uses: actions/checkout@v4 with: submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - - name: Download OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download 
OnnxRuntime Nightly run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ${{ env.NUGET_EXE }} install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + + - name: list files + shell: bash + run: | + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version - name: Build with CMake and GCC run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c1e51251b..123ff5f75 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -6,9 +6,11 @@ concurrency: cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-gpu-1.17.3" - ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" + jobs: linux-cuda-x64-build: @@ -29,19 +31,49 @@ jobs: clean: true path: manylinux submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - name: Download OnnxRuntime run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly + run: | + mono /usr/local/bin/nuget.exe install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + - name: list files + shell: bash run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename 
OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version + - name: Get Docker Image run: | @@ -78,7 +110,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release --parallel $( nproc )" + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Get HuggingFace Token run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9cb9cdc46..aba92d017 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -4,9 +4,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-osx-arm64-1.17.3" - ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: runs-on: macos-latest @@ -16,22 +15,21 @@ jobs: with: submodules: true - - name: Install ninja + - name: Get the Latest OnnxRuntime Nightly Version run: | - brew install ninja - - - name: Download OnnxRuntime + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV + - name: Download OnnxRuntime Nightly run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x - - name: Unzip OnnxRuntime + - name: Extra OnnxRuntime library and header files run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Rename OnnxRuntime to ort - run: | - mv ${{ env.ort_dir }} ort - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 916af3009..ce3bfcf4b 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -53,6 +53,11 @@ jobs: run: | cmake --build --preset windows_arm64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -62,10 +67,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify 
Build Artifacts if: always() diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ca0bb6b5b..cf5614dee 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -11,10 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-win-x64-1.17.3" - ort_zip: "$(ort_dir).zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)" binaryDir: 'build/cpu' + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: windows-cpu-x64-build: @@ -33,19 +32,32 @@ jobs: with: vs-version: '17.5' - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and nuget run: | - $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip" - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip + $ORT_NIGHTLY_VERSION = $(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly + run: | + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true - - name: Rename OnnxRuntime to ort + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -60,6 +72,11 @@ jobs: run: | cmake --build --preset windows_x64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the python wheel and test dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -76,10 +93,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index a9f602ef8..f0cebbae8 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -8,14 +8,12 @@ concurrency: env: AZCOPY_AUTO_LOGIN_TYPE: MSI AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - ort_dir: "onnxruntime-win-x64-gpu-1.17.3" - ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip" cuda_dir: "${{ github.workspace }}\\cuda_sdk" 
cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 binaryDir: 'build/cuda' - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Windows&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime.Gpu.Windows" jobs: windows-cuda-x64-build: @@ -35,17 +33,32 @@ jobs: run: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ env.cuda_version }}" ${{ env.cuda_dir}} - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and curl run: | - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh + run: | + $ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip - - name: Rename OnnxRuntime to ort + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -ExcludeVersion -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true + + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Configure CMake run: | @@ -59,6 +72,11 @@ jobs: run: | echo "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -75,10 +93,6 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index cd0c0a0b9..1ea6d85c8 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -6,7 +6,7 @@ "configurePresets": [ { "name": "macos_default", - "generator": "Ninja", + "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", diff --git a/nuget.config b/nuget.config index 3e0389a52..63a200340 100644 --- a/nuget.config +++ b/nuget.config @@ -3,11 +3,14 @@ - - - - - - - + + + + + + + + + + \ No newline at end of file diff --git a/onnxruntime-genai.sln b/onnxruntime-genai.sln new file mode 100644 index 000000000..5e59cc82e --- /dev/null +++ b/onnxruntime-genai.sln @@ -0,0 +1,36 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 
+MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{2253BDCC-33C9-431E-889A-56E3E75D10BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI", "src\csharp\Microsoft.ML.OnnxRuntimeGenAI.csproj", "{CA0EC087-3AF5-44D5-93F0-489420EBA014}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{505E2406-98C2-46DD-973A-3CEB95CF3626}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI.Tests", "test\csharp\Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj", "{24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.Build.0 = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.Build.0 = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.ActiveCfg = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014} = {2253BDCC-33C9-431E-889A-56E3E75D10BA} + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73} = {505E2406-98C2-46DD-973A-3CEB95CF3626} + EndGlobalSection +EndGlobal diff --git a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj similarity index 92% rename from test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj rename to test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index e4ec8e6d8..978deb04e 100644 --- a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -12,7 +12,8 @@ default True Debug;RelWithDebInfo;Release - + https://api.nuget.org/v3/index.json + $(RestoreAdditionalProjectSources);$(RestoreSources) Microsoft.ML.OnnxRuntimeGenAI.Tests Microsoft.ML.OnnxRuntimeGenAI.Tests From f94280f493c2f628726b7ea924592531fdb1bda1 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 17:51:11 -0700 Subject: [PATCH 14/15] Ensure CIs are running on merge (#334) --- .github/workflows/linux-cpu-arm64-build.yml | 9 ++++++++- .github/workflows/linux-cpu-x64-build.yml | 8 +++++++- .github/workflows/linux-gpu-x64-build.yml | 8 +++++++- .github/workflows/mac-cpu-arm64-build.yml | 8 +++++++- .github/workflows/win-cuda-x64-build.yml | 8 +++++++- .github/workflows/win-directml-x64-build.yml | 8 +++++++- 6 files changed, 43 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 3b55c3fe5..622b73eea 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -1,5 +1,12 @@ name: "Linux CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: + 
concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 744fa567a..290695c9c 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CPU x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 123ff5f75..f6cdf0f37 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index aba92d017..f2f90e427 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -1,5 +1,11 @@ name: "MacOS CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index f0cebbae8..ccc2f71fe 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 152b9ab1d..f7dcd89d0 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows DirectML x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From b3ff5cec93015ef8b76ce7778be1df0acb3d893c Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:38:06 -0700 Subject: [PATCH 15/15] Add 'add_extra_input' to handle models like QLora (#370) Add a new python api 'add_extra_input' that will take numpy tensors and turn them into OrtValue inputs internally. This allows models with extra custom inputs (like QLora) to be specified in python. C API to follow soon. 
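A minimal sketch of the intended Python usage, assuming a model that actually declares an extra input with the (hypothetical) name used below; the model folder is a placeholder:

```python
import numpy as np
import onnxruntime_genai as og

model = og.Model("path/to/model")  # placeholder folder
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=200)
params.input_ids = tokenizer.encode("def is_prime(num):")

# The numpy array is turned into an OrtValue internally and matched to a model
# input by name at run time; "lora_scale" is a hypothetical QLora-style input.
params.add_extra_input("lora_scale", np.array([1.0], dtype=np.float32))

output_tokens = model.generate(params)
print(tokenizer.decode(output_tokens[0]))
```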
--- src/generators.h | 8 ++++++++ src/models/model.cpp | 5 +++++ src/models/model.h | 2 ++ src/models/static_buffer.cpp | 19 ++----------------- src/models/static_buffer.h | 1 - src/python/python.cpp | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 18 deletions(-) diff --git a/src/generators.h b/src/generators.h index c6a510739..e6ad6f0e1 100644 --- a/src/generators.h +++ b/src/generators.h @@ -99,6 +99,14 @@ struct GeneratorParams : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + struct Input { + std::string name; + std::unique_ptr value; + }; + + // A list of extra model inputs that will be matched at runtime based on name + std::vector extra_inputs; + void TryGraphCapture(int max_bs); private: diff --git a/src/models/model.cpp b/src/models/model.cpp index 6f0cc294a..35a9b4ad4 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -35,6 +35,11 @@ static std::wstring CurrentModulePath() { namespace Generators { State::State(const GeneratorParams& params) : params_{params.shared_from_this()} { + // Add extra user inputs + for (auto& input : params.extra_inputs) { + input_names_.push_back(input.name.c_str()); + inputs_.push_back(input.value.get()); + } } void State::Run(OrtSession& session, OrtRunOptions& run_options) { diff --git a/src/models/model.h b/src/models/model.h index 5b9ec12d9..165e7c345 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -16,6 +16,8 @@ struct Tokenizer; void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream); +size_t GetOrtTypeSize(ONNXTensorElementDataType type); + struct State { State(const GeneratorParams& params); virtual ~State() = default; diff --git a/src/models/static_buffer.cpp b/src/models/static_buffer.cpp index 9bc5f50ea..eab776e65 100644 --- a/src/models/static_buffer.cpp +++ b/src/models/static_buffer.cpp @@ -1,4 +1,5 @@ #include "../generators.h" +#include "model.h" #include "static_buffer.h" namespace Generators { @@ -8,7 +9,7 @@ StaticBuffer::StaticBuffer(Ort::Allocator* allocator, size_t max_beam_batch_size std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape, ONNXTensorElementDataType type) { - size_t new_bytes = GetElementSize(type) * GetNumElements(shape); + size_t new_bytes = GetOrtTypeSize(type) * GetNumElements(shape); if (buffer_ == nullptr) { // Assuming the first dimension is the batch size bytes_ = new_bytes * (max_beam_batch_size_ / shape[0]); @@ -21,22 +22,6 @@ std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape) { size_t num_elements = 1; for (auto dim : shape) { diff --git a/src/models/static_buffer.h b/src/models/static_buffer.h index ce9e14686..8c133fdae 100644 --- a/src/models/static_buffer.h +++ b/src/models/static_buffer.h @@ -18,7 +18,6 @@ struct StaticBuffer { ONNXTensorElementDataType type); private: - size_t GetElementSize(ONNXTensorElementDataType type); size_t GetNumElements(std::span shape); Ort::Allocator* allocator_{nullptr}; diff --git a/src/python/python.cpp b/src/python/python.cpp index 1d8a4e567..8bd25a9d3 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -22,6 +22,34 @@ pybind11::array_t ToPython(std::span v) { return pybind11::array_t(v.size(), v.data()); } +ONNXTensorElementDataType ToTensorType(const pybind11::dtype& type) { + switch (type.num()) { + case pybind11::detail::npy_api::NPY_INT32_: + return Ort::TypeToTensorType::type; + case 
pybind11::detail::npy_api::NPY_UINT32_: + return Ort::TypeToTensorType::type; + case 23 /*NPY_FLOAT16*/: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_FLOAT_: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_DOUBLE_: + return Ort::TypeToTensorType::type; + default: + throw std::runtime_error("Unsupported numpy type"); + } +} + +std::unique_ptr ToTensor(pybind11::array& v) { + auto type = ToTensorType(v.dtype()); + + std::vector shape(v.ndim()); + for (pybind11::ssize_t i = 0; i < v.ndim(); i++) + shape[i] = v.shape()[i]; + + auto p_memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + return OrtValue::CreateTensor(*p_memory_info, v.mutable_data(), v.nbytes(), shape, type); +} + namespace Generators { // A roaming array is one that can be in CPU or GPU memory, and will copy the memory as needed to be used from anywhere @@ -85,6 +113,11 @@ struct PyGeneratorParams { } } + void AddExtraInput(const std::string& name, pybind11::array& value) { + params_->extra_inputs.push_back({name, ToTensor(value)}); + refs_.emplace_back(value); + } + void SetSearchOptions(const pybind11::kwargs& dict) { for (auto& entry : dict) { auto name = entry.first.cast(); @@ -110,6 +143,8 @@ struct PyGeneratorParams { pybind11::array_t py_input_ids_; pybind11::array_t py_whisper_input_features_; pybind11::array_t py_whisper_decoder_input_ids_; + + std::vector refs_; // References to data we want to ensure doesn't get garbage collected }; struct PyGenerator { @@ -198,6 +233,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def_readwrite("input_ids", &PyGeneratorParams::py_input_ids_) .def_readwrite("whisper_input_features", &PyGeneratorParams::py_whisper_input_features_) .def_readwrite("whisper_decoder_input_ids", &PyGeneratorParams::py_whisper_decoder_input_ids_) + .def("add_extra_input", &PyGeneratorParams::AddExtraInput) .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize);
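A short sketch of what the `ToTensorType` mapping above implies for callers; the model folder and input names are again placeholders, and the dtype list reflects only the cases handled in this change:

```python
import numpy as np
import onnxruntime_genai as og

model = og.Model("path/to/model")  # placeholder
params = og.GeneratorParams(model)

# Accepted by ToTensorType: int32, uint32, float16, float32, float64.
params.add_extra_input("scale_fp16", np.zeros((1,), dtype=np.float16))
params.add_extra_input("step_i32", np.zeros((1,), dtype=np.int32))

# Other dtypes fall into the default case of the switch and surface in Python
# as RuntimeError("Unsupported numpy type").
try:
    params.add_extra_input("bad_i64", np.array([1, 2, 3], dtype=np.int64))
except RuntimeError as err:
    print(err)
```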