diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml
index 5018bdbb6..3b55c3fe5 100644
--- a/.github/workflows/linux-cpu-arm64-build.yml
+++ b/.github/workflows/linux-cpu-arm64-build.yml
@@ -4,9 +4,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-linux-aarch64-1.17.1"
-  ort_zip: "onnxruntime-linux-aarch64-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-aarch64-1.17.1.tgz"
+  ort_dir: "onnxruntime-linux-aarch64-1.17.3"
+  ort_zip: "onnxruntime-linux-aarch64-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-aarch64-1.17.3.tgz"
 jobs:
   linux-cpu-arm64-build:
     runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2004-ARM-CPU" ]
diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml
index da202e19c..2e1c03aab 100644
--- a/.github/workflows/linux-cpu-x64-build.yml
+++ b/.github/workflows/linux-cpu-x64-build.yml
@@ -4,9 +4,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-linux-x64-1.17.1"
-  ort_zip: "onnxruntime-linux-x64-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-1.17.1.tgz"
+  ort_dir: "onnxruntime-linux-x64-1.17.3"
+  ort_zip: "onnxruntime-linux-x64-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz"
 
 jobs:
   linux_cpu_x64:
diff --git a/.github/workflows/linux-cpu-x64-nightly-build.yml b/.github/workflows/linux-cpu-x64-nightly-build.yml
index c7a0234b1..fd4fb20fc 100644
--- a/.github/workflows/linux-cpu-x64-nightly-build.yml
+++ b/.github/workflows/linux-cpu-x64-nightly-build.yml
@@ -12,9 +12,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-linux-x64-1.17.1"
-  ort_zip: "onnxruntime-linux-x64-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-1.17.1.tgz"
+  ort_dir: "onnxruntime-linux-x64-1.17.3"
+  ort_zip: "onnxruntime-linux-x64-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz"
 jobs:
   job:
     runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ]
diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml
index c4e4c372a..182ccb9f2 100644
--- a/.github/workflows/linux-gpu-x64-build.yml
+++ b/.github/workflows/linux-gpu-x64-build.yml
@@ -6,9 +6,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  ort_dir: "onnxruntime-linux-x64-gpu-1.17.1"
-  ort_zip: "onnxruntime-linux-x64-gpu-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-gpu-1.17.1.tgz"
+  ort_dir: "onnxruntime-linux-x64-gpu-1.17.3"
+  ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz"
 
 jobs:
   linux-gpu-x64-build:
diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml
index d757370f4..9cb9cdc46 100644
--- a/.github/workflows/mac-cpu-arm64-build.yml
+++ b/.github/workflows/mac-cpu-arm64-build.yml
@@ -4,9 +4,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-osx-arm64-1.17.1"
-  ort_zip: "onnxruntime-osx-arm64-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-osx-arm64-1.17.1.tgz"
+  ort_dir: "onnxruntime-osx-arm64-1.17.3"
+  ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz"
 jobs:
   mac-cpu-arm64-build:
     runs-on: macos-latest
diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml
index 7c64ba8ff..916af3009 100644
--- a/.github/workflows/win-cpu-arm64-build.yml
+++ b/.github/workflows/win-cpu-arm64-build.yml
@@ -11,9 +11,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-win-arm64-1.17.1"
+  ort_dir: "onnxruntime-win-arm64-1.17.3"
   ort_zip: "$(ort_dir).zip"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)"
   binaryDir: 'build/cpu'
 
 jobs:
@@ -33,7 +33,7 @@ jobs:
 
     - name: Download OnnxRuntime
       run: |
-        $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-win-arm64-1.17.1.zip"
+        $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-arm64-1.17.3.zip"
         Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip
 
     - name: Unzip OnnxRuntime
diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml
index f13f3c2c8..ca0bb6b5b 100644
--- a/.github/workflows/win-cpu-x64-build.yml
+++ b/.github/workflows/win-cpu-x64-build.yml
@@ -11,9 +11,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-win-x64-1.17.1"
+  ort_dir: "onnxruntime-win-x64-1.17.3"
   ort_zip: "$(ort_dir).zip"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)"
   binaryDir: 'build/cpu'
 
 jobs:
@@ -35,7 +35,7 @@ jobs:
 
     - name: Download OnnxRuntime
       run: |
-        $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-win-x64-1.17.1.zip"
+        $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip"
         Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip
 
     - name: Unzip OnnxRuntime
diff --git a/.github/workflows/win-gpu-x64-build.yml b/.github/workflows/win-gpu-x64-build.yml
index a3f1d338b..430001e62 100644
--- a/.github/workflows/win-gpu-x64-build.yml
+++ b/.github/workflows/win-gpu-x64-build.yml
@@ -8,9 +8,9 @@ concurrency:
 env:
   AZCOPY_AUTO_LOGIN_TYPE: MSI
   AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
-  ort_dir: "onnxruntime-win-x64-gpu-1.17.1"
-  ort_zip: "onnxruntime-win-x64-gpu-1.17.1.zip"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-win-x64-gpu-1.17.1.zip"
+  ort_dir: "onnxruntime-win-x64-gpu-1.17.3"
+  ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip"
   cuda_dir: "${{ github.workspace }}\\cuda_sdk"
   cuda_version: "11.8"
   CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8
diff --git a/.gitignore b/.gitignore
index 60b60827f..1ff9a0f9c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,7 @@
 /test/test_models/*
 /cache_models
 /onnxruntime-linux-x64-*
-/*.csv
+*.csv
 .idea
 cache_dir
 example-models
@@ -22,4 +22,6 @@
 examples/python/genai_models
 examples/python/hf_cache
 !test/test_models/hf-internal-testing/
-!test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx
\ No newline at end of file
+!test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx
+
+.ipynb_checkpoints/
\ No newline at end of file
diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml
index 451411fd2..b6fff7111 100644
--- a/.pipelines/nuget-publishing.yml
+++ b/.pipelines/nuget-publishing.yml
@@ -23,7 +23,7 @@ parameters:
 - name: ort_version
   displayName: 'OnnxRuntime version'
   type: string
-  default: '1.17.1'
+  default: '1.17.3'
 
 - name: cuda_version
   displayName: 'CUDA version'
diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml
index edce0b37d..1069ede44 100644
--- a/.pipelines/pypl-publishing.yml
+++ b/.pipelines/pypl-publishing.yml
@@ -22,7 +22,7 @@ parameters:
 - name: ort_version
   displayName: 'OnnxRuntime version'
   type: string
-  default: '1.17.1'
+  default: '1.17.3'
 
 - name: cuda_version
   displayName: 'CUDA version'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de12d6482..b325c2763 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,11 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
   add_compile_definitions(_DEBUG=1)
 endif()
 
+if(MSVC)
+  # set updated value for __cplusplus macro instead of 199711L
+  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>)
+endif()
+
 message(STATUS "Adding source files")
 
 file(GLOB generator_srcs CONFIGURE_DEPENDS
@@ -127,6 +132,11 @@ else()
   set(ONNXRUNTIME_EXTENSIONS_LIB "tfmtok_c.so")
 endif()
 
+file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}")
+if(USE_DML)
+  list(APPEND onnxruntime_libs "${ORT_LIB_DIR}/DirectML.dll")
+endif()
+
 if(NO_TOKENIZEROOT)
   add_compile_definitions(NO_TOKENIZER=1)
   message("----------------Tokenizer Disabled------------------")
@@ -148,6 +158,11 @@ if(ENABLE_PYTHON)
   message("------------------Enabling Python Wheel------------------")
 endif()
 
+if(ENABLE_MODEL_BENCHMARK)
+  add_subdirectory("${CMAKE_SOURCE_DIR}/benchmark/c")
+  message("------------------Enabling model benchmark------------------")
+endif()
+
 if(NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_LIB}")
   message(FATAL_ERROR "Expected the ONNX Runtime library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_LIB}. Actual: Not found.")
 endif()
@@ -158,7 +173,6 @@ if(USE_CUDA AND NOT EXISTS "${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_CUDA_LIB}")
   message(FATAL_ERROR "Expected the ONNX Runtime providers cuda library to be found at ${ORT_LIB_DIR}/${ONNXRUNTIME_PROVIDERS_CUDA_LIB}. Actual: Not found.")
 endif()
 
-file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}")
 target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR})
 target_link_libraries(onnxruntime-genai PRIVATE ${ONNXRUNTIME_LIB})
 
@@ -182,7 +196,6 @@ if(MSVC)
 endif()
 
 # Copy the onnxruntime binaries into the build folder so it's found on launch
-file(GLOB onnxruntime_libs "${ORT_LIB_DIR}/${ONNXRUNTIME_FILES}")
 foreach(DLL_FILE ${onnxruntime_libs})
   add_custom_command(
     TARGET onnxruntime-genai POST_BUILD
diff --git a/README.md b/README.md
index b572a94fd..0e8b42566 100644
--- a/README.md
+++ b/README.md
@@ -50,10 +50,19 @@ See full documentation at [https://onnxruntime.ai/docs/genai].
 [Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package.
 
+1. Build the model
+```shell
+python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./models/phi2
+```
+
+2. Run inference
 ```python
+import os
 import onnxruntime_genai as og
 
-model = og.Model(f'models/microsoft/phi-2')
+model_path = os.path.abspath("./models/phi2")
+
+model = og.Model(model_path)
 
 tokenizer = og.Tokenizer(model)
 
@@ -64,7 +73,7 @@ prompt = '''def print_prime(n):
 
 tokens = tokenizer.encode(prompt)
 
-params = og.SearchParams(model)
+params = og.GeneratorParams(model)
 params.set_search_options({"max_length":200})
 params.input_ids = tokens
 
diff --git a/VERSION_INFO b/VERSION_INFO
index 3e2177af6..b4f09dd42 100644
--- a/VERSION_INFO
+++ b/VERSION_INFO
@@ -1 +1 @@
-0.1.0rc4
\ No newline at end of file
+0.2.0-dev
\ No newline at end of file
diff --git a/benchmark/c/CMakeLists.txt b/benchmark/c/CMakeLists.txt
new file mode 100644
index 000000000..0035f3e5e
--- /dev/null
+++ b/benchmark/c/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+set(model_benchmark_srcs
+  ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/options.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/options.cpp
+)
+
+add_executable(model_benchmark ${model_benchmark_srcs})
+
+target_include_directories(model_benchmark PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_SOURCE_DIR}/src  # directory containing the ort_genai headers
+)
+
+target_link_libraries(model_benchmark PRIVATE onnxruntime-genai-static ${ONNXRUNTIME_LIB})
+
+target_link_directories(model_benchmark PRIVATE ${ORT_LIB_DIR})
+
+add_custom_command(TARGET model_benchmark POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${onnxruntime_libs} $<TARGET_FILE_DIR:model_benchmark>
+)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${model_benchmark_srcs})
diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp
new file mode 100644
index 000000000..0a6840c42
--- /dev/null
+++ b/benchmark/c/main.cpp
@@ -0,0 +1,242 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "ort_genai.h"
+
+#include "options.h"
+
+namespace {
+
+using Clock = std::chrono::steady_clock;
+
+using Duration = Clock::duration;
+using DurationFp = std::chrono::duration<float, Duration::period>;
+
+class Timing {
+ public:
+  Timing(const Timing&) = delete;
+  Timing& operator=(const Timing&) = delete;
+
+  Timing(std::vector<Duration>& measurements)
+      : measurements_{measurements}, start_{Clock::now()} {
+  }
+
+  ~Timing() {
+    const auto measurement = Clock::now() - start_;
+    measurements_.push_back(measurement);
+  }
+
+ private:
+  std::vector<Duration>& measurements_;
+  const Clock::time_point start_;
+};
+
+struct Statistics {
+  DurationFp average{};
+  DurationFp stddev{};
+  DurationFp p50{};
+  DurationFp p90{};
+  DurationFp p99{};
+  size_t n{};
+};
+
+Statistics ComputeStats(const std::vector<Duration>& measurements) {
+  Statistics stats{};
+  if (measurements.empty()) {
+    return stats;
+  }
+
+  stats.n = measurements.size();
+
+  const auto sum = std::accumulate(measurements.begin(), measurements.end(), Duration{0});
+  stats.average = DurationFp{sum} / stats.n;
+
+  std::vector<Duration> sorted = measurements;
+  std::sort(sorted.begin(), sorted.end());
+
+  stats.p50 = sorted[stats.n * 0.5];
+  stats.p90 = sorted[stats.n * 0.9];
+  stats.p99 = sorted[stats.n * 0.99];
+
+  if (stats.n > 1) {
+    const float variance =
+        std::accumulate(
+            measurements.begin(), measurements.end(),
+            0.0f,
+            [mean = stats.average.count()](float accumulator, const Duration& m) -> float {
+              const float distance_from_mean = m.count() - mean;
+              return accumulator + distance_from_mean * distance_from_mean;
+            }) /
+        (stats.n - 1);
+
+    const float stddev = std::sqrt(variance);
+    stats.stddev = DurationFp{stddev};
+  }
+
+  return stats;
+}
+
+void WritePerTokenStats(std::string_view label,
+                        const Statistics& stats,
+                        const size_t tokens_per_measurement) {
+  using MicrosecondsFp = std::chrono::duration<float, std::micro>;
+  const auto avg_us = MicrosecondsFp{stats.average};
+  std::cout << label << ":"
+            << "\n\tavg (us): " << avg_us.count()
+            << "\n\tavg (tokens/s): " << 1.0e6f / avg_us.count() * tokens_per_measurement
+            << "\n\tp50 (us): " << MicrosecondsFp{stats.p50}.count()
+            << "\n\tstddev (us): " << MicrosecondsFp{stats.stddev}.count()
+            << "\n\tn: " << stats.n << " * " << tokens_per_measurement << " token(s)"
+            << "\n";
+}
+
+void WriteE2EStats(std::string_view label,
+                   const Statistics& stats) {
+  using MillisecondsFp = std::chrono::duration<float, std::milli>;
+  std::cout << label << ":"
+            << "\n\tavg (ms): " << MillisecondsFp{stats.average}.count()
+            << "\n\tp50 (ms): " << MillisecondsFp{stats.p50}.count()
+            << "\n\tstddev (ms): " << MillisecondsFp{stats.stddev}.count()
+            << "\n\tn: " << stats.n
+            << "\n";
+}
+
+std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) {
+  const char* const base_prompt = "A";
+  auto base_prompt_sequences = OgaSequences::Create();
+
+  tokenizer.Encode(base_prompt, *base_prompt_sequences);
+
+  auto params = OgaGeneratorParams::Create(model);
+  params->SetSearchOption("max_length", num_prompt_tokens);
+  params->SetSearchOption("min_length", num_prompt_tokens);
+  params->SetInputSequences(*base_prompt_sequences);
+
+  auto output_sequences = model.Generate(*params);
+  const auto output_sequence_length = output_sequences->SequenceCount(0);
+  const auto* output_sequence_data = output_sequences->SequenceData(0);
+  return std::string{tokenizer.Decode(output_sequence_data, output_sequence_length)};
+}
+
+void RunBenchmark(const benchmark::Options& opts) {
+  auto model = OgaModel::Create(opts.model_path.c_str());
+  auto tokenizer = OgaTokenizer::Create(*model);
+
+  const std::string prompt = GeneratePrompt(opts.num_prompt_tokens, *model, *tokenizer);
+  auto prompt_sequences = OgaSequences::Create();
+
+  if (opts.batch_size < 1) {
+    throw std::runtime_error("Batch size must be at least 1.");
+  }
+
+  for (size_t i = 0; i < opts.batch_size; ++i) {
+    tokenizer->Encode(prompt.c_str(), *prompt_sequences);
+  }
+
+  const size_t num_prompt_tokens = prompt_sequences->SequenceCount(0);
+  const size_t num_tokens = num_prompt_tokens + opts.num_tokens_to_generate;
+
+  auto make_generator_params = [&] {
+    auto params = OgaGeneratorParams::Create(*model);
+    params->SetSearchOption("max_length", num_tokens);
+    params->SetSearchOption("min_length", num_tokens);
+    params->SetInputSequences(*prompt_sequences);
+    return params;
+  };
+
+  const auto generator_params = make_generator_params();
+
+  // warmup
+  if (opts.verbose) std::cout << "Running warmup iterations (" << opts.num_warmup_iterations << ")...\n";
+  for (size_t i = 0; i < opts.num_warmup_iterations; ++i) {
+    auto output_sequences = model->Generate(*generator_params);
+
+    if (opts.verbose && i == 0) {
+      // show prompt and output on first iteration
+      std::cout << "Prompt:\n\t" << prompt << "\n";
+      const auto output_sequence_length = output_sequences->SequenceCount(0);
+      const auto* output_sequence_data = output_sequences->SequenceData(0);
+      const auto output = tokenizer->Decode(output_sequence_data, output_sequence_length);
+      std::cout << "Output:\n\t" << output << "\n";
+    }
+  }
+
+  std::vector<Duration> e2e_gen_times, prompt_processing_times, token_gen_times, sampling_times;
+  // note: be sure to reserve enough to avoid vector reallocations in the measured code
+  e2e_gen_times.reserve(opts.num_iterations);
+  prompt_processing_times.reserve(opts.num_iterations);
+  token_gen_times.reserve(opts.num_iterations * (opts.num_tokens_to_generate - 1));
+  sampling_times.reserve(opts.num_iterations * opts.num_tokens_to_generate);
+
+  if (opts.verbose) std::cout << "Running iterations (" << opts.num_iterations << ")...\n";
+  for (size_t i = 0; i < opts.num_iterations; ++i) {
+    auto generator = OgaGenerator::Create(*model, *generator_params);
+
+    {
+      Timing e2e_gen_timing{e2e_gen_times};
+
+      {
+        Timing prompt_processing_timing{prompt_processing_times};
+        generator->ComputeLogits();
+      }
+
+      {
+        Timing sampling_timing{sampling_times};
+        generator->GenerateNextToken();
+      }
+
+      while (!generator->IsDone()) {
+        {
+          Timing token_gen_timing{token_gen_times};
+          generator->ComputeLogits();
+        }
+
+        {
+          Timing sampling_timing{sampling_times};
+          generator->GenerateNextToken();
+        }
+      }
+    }
+  }
+
+  {
+    std::cout << "Batch size: " << opts.batch_size
+              << ", prompt tokens: " << num_prompt_tokens
+              << ", tokens to generate: " << opts.num_tokens_to_generate
+              << "\n";
+
+    const auto e2e_gen_stats = ComputeStats(e2e_gen_times);
+    const auto prompt_processing_stats = ComputeStats(prompt_processing_times);
+    const auto token_gen_stats = ComputeStats(token_gen_times);
+    const auto sampling_stats = ComputeStats(sampling_times);
+
+    WritePerTokenStats("Prompt processing (time to first token)",
+                       prompt_processing_stats, opts.batch_size * num_prompt_tokens);
+    WritePerTokenStats("Token generation", token_gen_stats, opts.batch_size);
+    WritePerTokenStats("Token sampling", sampling_stats, opts.batch_size);
+    WriteE2EStats("E2E generation (entire generation loop)", e2e_gen_stats);
+  }
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  try {
+    const auto opts = benchmark::ParseOptionsFromCommandLine(argc, argv);
+    RunBenchmark(opts);
+    return 0;
+  } catch (const std::exception& e) {
+    std::cerr << "Exception: " << e.what() << "\n";
+    return 1;
+  }
+}
diff --git a/benchmark/c/options.cpp b/benchmark/c/options.cpp
new file mode 100644
index 000000000..7047a4466
--- /dev/null
+++ b/benchmark/c/options.cpp
@@ -0,0 +1,110 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "options.h"
+
+#include <charconv>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <system_error>
+
+namespace benchmark {
+
+namespace {
+
+[[noreturn]] void PrintHelpAndExit(const char* program_name, int exit_code) {
+  Options defaults{};
+  std::ostringstream s;
+
+  s << "Usage: " << program_name << " -i <model_path>\n"
+    << "  Options:\n"
+    << "    -i,--input_folder <path>\n"
+    << "      Path to the ONNX model directory to benchmark, compatible with onnxruntime-genai.\n"
+    << "    -b,--batch_size <number>\n"
+    << "      Number of sequences to generate in parallel. Default: " << defaults.batch_size << "\n"
+    << "    -l,--prompt_length <number>\n"
+    << "      Number of tokens in the prompt. Default: " << defaults.num_prompt_tokens << "\n"
+    << "    -g,--generation_length <number>\n"
+    << "      Number of tokens to generate. Default: " << defaults.num_tokens_to_generate << "\n"
+    << "    -r,--repetitions <number>\n"
+    << "      Number of times to repeat the benchmark. Default: " << defaults.num_iterations << "\n"
+    << "    -w,--warmup <number>\n"
+    << "      Number of warmup runs before benchmarking. Default: " << defaults.num_warmup_iterations << "\n"
+    << "    -v,--verbose\n"
+    << "      Show more informational output.\n"
+    << "    -h,--help\n"
+    << "      Show this help message and exit.\n";
+
+  std::cerr << s.str();
+  std::exit(exit_code);
+}
+
+template <typename T>
+T ParseNumber(std::string_view s) {
+  T n;
+  const auto *s_begin = s.data(), *s_end = s.data() + s.size();
+  const auto [ptr, ec] = std::from_chars(s_begin, s_end, n);
+  if (ec != std::errc{} || ptr != s_end) {
+    throw std::runtime_error(std::string{"Failed to parse option value as number: "}.append(s));
+  }
+  return n;
+}
+
+void VerifyOptions(const Options& opts) {
+  if (opts.model_path.empty()) {
+    throw std::runtime_error("ONNX model directory path must be provided.");
+  }
+}
+
+}  // namespace
+
+Options ParseOptionsFromCommandLine(int argc, const char* const* argv) {
+  const char* const program_name = argc > 0 ? argv[0] : "model_benchmark";
argv[0] : "model_benchmark"; + try { + Options opts{}; + + auto next_arg = [argc, argv](int& idx) { + if (idx + 1 >= argc) { + throw std::runtime_error("Option value not provided."); + } + return std::string_view{argv[++idx]}; + }; + + for (int i = 1; i < argc; ++i) { + std::string_view arg{argv[i]}; + + if (arg == "-i" || arg == "--input_folder") { + opts.model_path = next_arg(i); + } else if (arg == "-b" || arg == "--batch_size") { + opts.batch_size = ParseNumber(next_arg(i)); + } else if (arg == "-l" || arg == "--prompt_length") { + opts.num_prompt_tokens = ParseNumber(next_arg(i)); + } else if (arg == "-g" || arg == "--generation_length") { + opts.num_tokens_to_generate = ParseNumber(next_arg(i)); + } else if (arg == "-r" || arg == "--repetitions") { + opts.num_iterations = ParseNumber(next_arg(i)); + } else if (arg == "-w" || arg == "--warmup") { + opts.num_warmup_iterations = ParseNumber(next_arg(i)); + } else if (arg == "-v" || arg == "--verbose") { + opts.verbose = true; + } else if (arg == "-h" || arg == "--help") { + PrintHelpAndExit(program_name, 0); + } else { + throw std::runtime_error(std::string{"Unknown option: "}.append(arg)); + } + } + + VerifyOptions(opts); + + return opts; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << "\n"; + PrintHelpAndExit(program_name, 1); + } +} + +} // namespace benchmark diff --git a/benchmark/c/options.h b/benchmark/c/options.h new file mode 100644 index 000000000..a00d19191 --- /dev/null +++ b/benchmark/c/options.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +namespace benchmark { + +struct Options { + std::string model_path{}; + size_t num_prompt_tokens{16}; + size_t num_tokens_to_generate{128}; + size_t batch_size{1}; + size_t num_iterations{5}; + size_t num_warmup_iterations{1}; + bool verbose{false}; +}; + +Options ParseOptionsFromCommandLine(int argc, const char* const* argv); + +} // namespace benchmark diff --git a/benchmark/python/README b/benchmark/python/README index da1174309..67cac3ccb 100644 --- a/benchmark/python/README +++ b/benchmark/python/README @@ -2,7 +2,7 @@ This is an end-to-end benchmarking script for any GenAI-supported ONNX model. Prerequisites: -0) Install onnxruntime-genai and onnxruntime +0) Install pandas, onnxruntime-genai and onnxruntime 1) Use builder.py to build the desired ONNX model @@ -10,4 +10,4 @@ Prerequisites: Example call to benchmarking script -python benchmark_e2e.py -i {model folder} -b 1 -l 128 -g 256 -r 100 -w 10 -k 5 -o {output csv file name} \ No newline at end of file +python benchmark_e2e.py -i {model folder} -b 1 -l 128 -g 256 -r 100 -w 10 -k 5 -o {output csv file name} diff --git a/build.py b/build.py index 150ba7a54..5c5b8febb 100644 --- a/build.py +++ b/build.py @@ -20,6 +20,10 @@ def is_linux(): """Check if the current platform is Linux.""" return sys.platform.startswith("linux") +def is_mac(): + """Check if the current platform is MacOS""" + return sys.platform.startswith("darwin") + def platform(): """Get the current platform.""" @@ -110,7 +114,7 @@ def build( Args: skip_wheel: Whether to skip building the Python wheel. Defaults to False. 
""" - if not is_windows() and not is_linux(): + if not is_windows() and not is_linux() and not is_mac(): raise OSError(f"Unsupported platform {platform()}.") if cuda_home and not use_cuda: diff --git a/cmake/options.cmake b/cmake/options.cmake index 80f004215..ac40a6d1d 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -6,5 +6,6 @@ option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." ON) option(ENABLE_TESTS "Enable tests" ON) option(TEST_PHI2 "Enable tests for Phi2" OFF) +option(ENABLE_MODEL_BENCHMARK "Build model benchmark program" ON) cmake_dependent_option(BUILD_WHEEL "Build the python wheel" ON "ENABLE_PYTHON" OFF) \ No newline at end of file diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index d44909286..9b33a3ed3 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -4,13 +4,24 @@ project(phi2) set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++2a") +set(ORT_GENAI_LIB_DIR ${CMAKE_SOURCE_DIR}/lib) + +if(WIN32) + set(ONNXRUNTIME_GENAI_LIB "onnxruntime-genai.dll") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.dll") +elseif(APPLE) + set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.dylib") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.dylib") +else() + set(ONNXRUNTIME_GENAI_LIB "libonnxruntime-genai.so") + set(ONNXRUNTIME_GENAI_DEPENDENCY "*.so") +endif() + add_executable(phi2 ${CMAKE_SOURCE_DIR}/src/main.cpp) -add_library(onnxruntime-genai SHARED IMPORTED) -set_target_properties(onnxruntime-genai PROPERTIES - IMPORTED_LOCATION_RELEASE ${CMAKE_SOURCE_DIR}/lib/onnxruntime-genai.dll - IMPORTED_IMPLIB_RELEASE ${CMAKE_SOURCE_DIR}/lib/onnxruntime-genai.lib -) + +target_link_directories(phi2 PRIVATE ${ORT_GENAI_LIB_DIR}) +target_link_libraries(phi2 PRIVATE ${ONNXRUNTIME_GENAI_LIB}) target_include_directories(phi2 PRIVATE ${CMAKE_SOURCE_DIR}/include) target_link_libraries( @@ -18,5 +29,11 @@ target_link_libraries( PUBLIC onnxruntime-genai) -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/phi-2" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/Release") -file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/lib/" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/Release") +file(GLOB ort_genai_libs "${CMAKE_SOURCE_DIR}/lib/${ONNXRUNTIME_GENAI_DEPENDENCY}") + +foreach(DLL_FILE ${ort_genai_libs}) + add_custom_command( + TARGET phi2 POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${DLL_FILE} $ + ) +endforeach() \ No newline at end of file diff --git a/examples/c/README.md b/examples/c/README.md index 8cd2168fd..0a45578cd 100644 --- a/examples/c/README.md +++ b/examples/c/README.md @@ -48,5 +48,5 @@ cmake --build . 
 ```bash
 cd build\\Release
 
-.\phi2.exe
+.\phi2.exe path_to_model
 ```
diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp
index d9aeb68a8..e4be639f2 100644
--- a/examples/c/src/main.cpp
+++ b/examples/c/src/main.cpp
@@ -4,8 +4,8 @@
 
 // C++ API Example
 
-void CXX_API() {
-  auto model = OgaModel::Create("phi-2");
+void CXX_API(const char* model_path) {
+  auto model = OgaModel::Create(model_path);
 
   auto tokenizer = OgaTokenizer::Create(*model);
 
   const char* prompt = "def is_prime(num):";
@@ -34,9 +34,9 @@ void CheckResult(OgaResult* result) {
   }
 }
 
-void C_API() {
+void C_API(const char* model_path) {
   OgaModel* model;
-  OgaCreateModel("phi-2", &model);
+  OgaCreateModel(model_path, &model);
 
   OgaTokenizer* tokenizer;
   CheckResult(OgaCreateTokenizer(model, &tokenizer));
@@ -74,16 +74,26 @@ void C_API() {
   OgaDestroyModel(model);
 }
 
-int main() {
+static void print_usage(int /*argc*/, char** argv) {
+  std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
+}
+
+int main(int argc, char** argv) {
+  if (argc != 2) {
+    print_usage(argc, argv);
+    return -1;
+  }
+
+  std::cout << "-------------" << std::endl;
   std::cout << "Hello, Phi-2!" << std::endl;
   std::cout << "-------------" << std::endl;
 
   std::cout << "C++ API" << std::endl;
-  CXX_API();
+  CXX_API(argv[1]);
 
   std::cout << "C API" << std::endl;
-  C_API();
+  C_API(argv[1]);
 
   return 0;
 }
\ No newline at end of file
diff --git a/examples/csharp/Genny/.gitignore b/examples/csharp/Genny/.gitignore
new file mode 100644
index 000000000..496192431
--- /dev/null
+++ b/examples/csharp/Genny/.gitignore
@@ -0,0 +1,346 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+# User-specific files
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUNIT
+*.VisualState.xml
+TestResult.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+**/Properties/launchSettings.json
+
+# StyleCop
+StyleCopReport.xml
+
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_i.h
+*.ilk
+*.obj
+*.iobj
+*.pch
+*.pdb
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# Visual Studio Trace Files
+*.e2e
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# JustCode is a .NET coding add-in
+.JustCode
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# JetBrains Rider
+.idea/
+*.sln.iml
+
+# CodeRush
+.cr/
+
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+
+# Tabs Studio
+*.tss
+
+# Telerik's JustMock configuration file
+*.jmconfig
+
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+
+# OpenCover UI analysis results
+OpenCover/
+
+# Azure Stream Analytics local run output
+ASALocalRun/
+
+# MSBuild Binary and Structured Log
+*.binlog
+
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+/docs/build
+src/TensorFlowNET.Native/bazel-*
+src/TensorFlowNET.Native/c_api.h
+/.vscode
+test/TensorFlowNET.Examples/mnist
+
+
+# training model resources
+.resources
+/redist
+*.xml
+*.xsd
+
+# docs
+site/
+
+docker-test-output/*
diff --git a/examples/csharp/Genny/Assets/Screenshot1.PNG b/examples/csharp/Genny/Assets/Screenshot1.PNG
new file mode 100644
index 000000000..59ef9f19a
Binary files /dev/null and b/examples/csharp/Genny/Assets/Screenshot1.PNG differ
diff --git a/examples/csharp/Genny/Assets/Screenshot2.PNG b/examples/csharp/Genny/Assets/Screenshot2.PNG
new file mode 100644
index 000000000..d1c635481
Binary files /dev/null and b/examples/csharp/Genny/Assets/Screenshot2.PNG differ
diff --git a/examples/csharp/Genny/Genny.sln b/examples/csharp/Genny/Genny.sln
new file mode 100644
index 000000000..3a30e258e
--- /dev/null
+++ b/examples/csharp/Genny/Genny.sln
@@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.9.34622.214
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Genny", "Genny\Genny.csproj", "{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug_Cuda|x64 = Debug_Cuda|x64
+		Debug|x64 = Debug|x64
+		Release_Cuda|x64 = Release_Cuda|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug_Cuda|x64.ActiveCfg = Debug_Cuda|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug_Cuda|x64.Build.0 = Debug_Cuda|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug|x64.ActiveCfg = Debug|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Debug|x64.Build.0 = Debug|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release_Cuda|x64.ActiveCfg = Release_Cuda|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release_Cuda|x64.Build.0 = Release_Cuda|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release|x64.ActiveCfg = Release|x64
+		{831197BD-63C7-4C0F-AD0E-4F6783CBB5C0}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {A7159277-CA72-45A9-8327-E3BF29214643}
+	EndGlobalSection
+EndGlobal
diff --git a/examples/csharp/Genny/Genny/App.xaml b/examples/csharp/Genny/Genny/App.xaml
new file mode 100644
index 000000000..ec5ea8fd1
--- /dev/null
+++ b/examples/csharp/Genny/Genny/App.xaml
@@ -0,0 +1,10 @@
+<Application x:Class="Genny.App"
+             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
+             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
+             xmlns:local="clr-namespace:Genny"
+             StartupUri="MainWindow.xaml">
+    <Application.Resources>
+
+    </Application.Resources>
+</Application>
+
diff --git a/examples/csharp/Genny/Genny/App.xaml.cs b/examples/csharp/Genny/Genny/App.xaml.cs
new file mode 100644
index 000000000..b6e61e540
--- /dev/null
+++ b/examples/csharp/Genny/Genny/App.xaml.cs
@@ -0,0 +1,11 @@
+using System.Windows;
+
+namespace Genny
+{
+    /// <summary>
+    /// Interaction logic for App.xaml
+    /// </summary>
+    public partial class App : Application
+    {
+    }
+}
diff --git a/examples/csharp/Genny/Genny/AssemblyInfo.cs b/examples/csharp/Genny/Genny/AssemblyInfo.cs
new file mode 100644
index 000000000..b0ec82757
--- /dev/null
+++ b/examples/csharp/Genny/Genny/AssemblyInfo.cs
@@ -0,0 +1,10 @@
+using System.Windows;
+
+[assembly: ThemeInfo(
+    ResourceDictionaryLocation.None,           //where theme specific resource dictionaries are located
+                                               //(used if a resource is not found in the page,
+                                               // or application resource dictionaries)
+    ResourceDictionaryLocation.SourceAssembly  //where the generic resource dictionary is located
+                                               //(used if a resource is not found in the page,
+                                               // app, or any theme specific resource dictionaries)
+)]
diff --git a/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml b/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml
new file mode 100644
index 000000000..2983243b5
--- /dev/null
+++ b/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml
@@ -0,0 +1,82 @@
diff --git a/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml.cs b/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml.cs
new file mode 100644
index 000000000..6386a43de
--- /dev/null
+++ b/examples/csharp/Genny/Genny/Controls/SearchOptionsControl.xaml.cs
@@ -0,0 +1,30 @@
+using Genny.ViewModel;
+using System.Windows;
+using System.Windows.Controls;
+
+namespace Genny.Controls
+{
+    /// <summary>
+    /// Interaction logic for SearchOptionsControl.xaml
+    /// </summary>
+    public partial class SearchOptionsControl : UserControl
+    {
+        public SearchOptionsControl()
+        {
+            InitializeComponent();
+        }
+
+        public static readonly DependencyProperty SearchOptionsProperty =
+            DependencyProperty.Register(nameof(SearchOptions), typeof(SearchOptionsModel), typeof(SearchOptionsControl), new PropertyMetadata(new SearchOptionsModel()));
+
+
+        /// <summary>
+        /// Gets or sets the search options.
+        /// </summary>
+        public SearchOptionsModel SearchOptions
+        {
+            get { return (SearchOptionsModel)GetValue(SearchOptionsProperty); }
+            set { SetValue(SearchOptionsProperty, value); }
+        }
+    }
+}
diff --git a/examples/csharp/Genny/Genny/Extensions.cs b/examples/csharp/Genny/Genny/Extensions.cs
new file mode 100644
index 000000000..5074df1e2
--- /dev/null
+++ b/examples/csharp/Genny/Genny/Extensions.cs
@@ -0,0 +1,50 @@
+using Genny.ViewModel;
+using Microsoft.ML.OnnxRuntimeGenAI;
+using System.Threading;
+using System.Threading.Tasks;
+using System.Windows;
+
+namespace Genny
+{
+    internal static class Extensions
+    {
+
+        /// <summary>
+        /// Applies the search options to the generator parameters.
+        /// </summary>
+        /// <param name="generatorParams">The generator parameters.</param>
+        /// <param name="searchOptions">The search options.</param>
+        internal static void ApplySearchOptions(this GeneratorParams generatorParams, SearchOptionsModel searchOptions)
+        {
+            generatorParams.SetSearchOption("top_p", searchOptions.TopP);
+            generatorParams.SetSearchOption("top_k", searchOptions.TopK);
+            generatorParams.SetSearchOption("temperature", searchOptions.Temperature);
+            generatorParams.SetSearchOption("repetition_penalty", searchOptions.RepetitionPenalty);
+            generatorParams.SetSearchOption("past_present_share_buffer", searchOptions.PastPresentShareBuffer);
+            generatorParams.SetSearchOption("num_return_sequences", searchOptions.NumReturnSequences);
+            generatorParams.SetSearchOption("no_repeat_ngram_size", searchOptions.NoRepeatNgramSize);
+            generatorParams.SetSearchOption("min_length", searchOptions.MinLength);
+            generatorParams.SetSearchOption("max_length", searchOptions.MaxLength);
+            generatorParams.SetSearchOption("length_penalty", searchOptions.LengthPenalty);
+            generatorParams.SetSearchOption("early_stopping", searchOptions.EarlyStopping);
+            generatorParams.SetSearchOption("do_sample", searchOptions.DoSample);
+            generatorParams.SetSearchOption("diversity_penalty", searchOptions.DiversityPenalty);
+        }
+
+        internal static Task<Sequences> EncodeAsync(this Tokenizer tokenizer, string input, CancellationToken cancellationToken = default)
+        {
+            return Application.Current.Dispatcher.Invoke(() =>
+            {
+                return Task.Run(() => tokenizer.Encode(input), cancellationToken);
+            });
+        }
+
+        internal static Task<string> DecodeAsync(this Tokenizer tokenizer, int[] input, CancellationToken cancellationToken = default)
+        {
+            return Application.Current.Dispatcher.Invoke(() =>
+            {
+                return Task.Run(() => tokenizer.Decode(input), cancellationToken);
+            });
+        }
+    }
+}
diff --git a/examples/csharp/Genny/Genny/Genny.csproj b/examples/csharp/Genny/Genny/Genny.csproj
new file mode 100644
index 000000000..d4928ad9f
--- /dev/null
+++ b/examples/csharp/Genny/Genny/Genny.csproj
@@ -0,0 +1,25 @@
+    WinExe
+    net6.0-windows
+    disable
+    disable
+    true
+    true
+    x64
+    x64
+    Debug;Release;Debug_Cuda;Release_Cuda;
diff --git a/examples/csharp/Genny/Genny/Images/robot.png b/examples/csharp/Genny/Genny/Images/robot.png
new file mode 100644
index 000000000..96edd0fb1
Binary files /dev/null and b/examples/csharp/Genny/Genny/Images/robot.png differ
diff --git a/examples/csharp/Genny/Genny/Images/user.png b/examples/csharp/Genny/Genny/Images/user.png
new file mode 100644
index 000000000..dcaf32f59
Binary files /dev/null and b/examples/csharp/Genny/Genny/Images/user.png differ
diff --git a/examples/csharp/Genny/Genny/MainWindow.xaml b/examples/csharp/Genny/Genny/MainWindow.xaml
new file mode 100644
index 000000000..3d721f96b
--- /dev/null
+++ b/examples/csharp/Genny/Genny/MainWindow.xaml
@@ -0,0 +1,72 @@