From a028d7879a2adc92f6a8bf0babe01d1c7398c15f Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:31:46 -0700 Subject: [PATCH 1/7] Simulate the chat template (#352) --- examples/python/model-qa.py | 12 ++++- examples/python/phi-3-tutorial.md | 6 +-- examples/python/phi3-qa.py | 87 +++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 examples/python/phi3-qa.py diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 6f323ccc4..57ec9f6db 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -15,6 +15,9 @@ def main(args): if args.verbose: print("Tokenizer created") if args.verbose: print() search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1: + print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'") + exit(1) # Keep asking for input prompts in a loop while True: @@ -25,7 +28,12 @@ def main(args): if args.timings: started_timestamp = time.time() - input_tokens = tokenizer.encode(args.system_prompt + text) + # If there is a chat template, use it + prompt = text + if args.chat_template: + prompt = f'{args.chat_template.format(input=text)}' + + input_tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) params.try_use_cuda_graph_with_max_batch_size(1) @@ -76,7 +84,7 @@ def main(args): parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-s', '--system_prompt', type=str, default='', help='Prepend a system prompt to the user input prompt. Defaults to empty') parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}') args = parser.parse_args() main(args) \ No newline at end of file diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 6d2f5f728..5442886a3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -64,7 +64,7 @@ pip install --pre onnxruntime-genai ## Run the model -Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. @@ -74,8 +74,8 @@ The `-m` argument is the path to the model you downloaded from HuggingFace above The `-l` argument is the length of output you would like to generate with the model. 
```bash -curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py -python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 +curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py +python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py new file mode 100644 index 000000000..9e9392895 --- /dev/null +++ b/examples/python/phi3-qa.py @@ -0,0 +1,87 @@ +import onnxruntime_genai as og +import argparse +import time + +def main(args): + if args.verbose: print("Loading model...") + if args.timings: + started_timestamp = 0 + first_token_timestamp = 0 + + model = og.Model(f'{args.model}') + if args.verbose: print("Model loaded") + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + if args.verbose: print("Tokenizer created") + if args.verbose: print() + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + + # Keep asking for input prompts in a loop + while True: + text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + + if args.timings: started_timestamp = time.time() + + # If there is a chat template, use it + prompt = f'{chat_template.format(input=text)}' + + print(f"Prompt: {prompt}") + + input_tokens = tokenizer.encode(prompt) + + params = og.GeneratorParams(model) + params.try_use_cuda_graph_with_max_batch_size(1) + params.set_search_options(**search_options) + params.input_ids = input_tokens + generator = og.Generator(model, params) + if args.verbose: print("Generator created") + + if args.verbose: print("Running generation loop ...") + if args.timings: + first = True + new_tokens = [] + + print() + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.compute_logits() + generator.generate_next_token() + if args.timings: + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end='', flush=True) + if args.timings: new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + print() + print() + + if args.timings: + prompt_time = first_token_timestamp - started_timestamp + run_time = time.time() - first_token_timestamp + print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)') + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including 
the prompt') + parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') + parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') + parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') + parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + args = parser.parse_args() + main(args) \ No newline at end of file From cb4e3aaf982abcd22adfc01a9d4663f76f645561 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:35:37 -0700 Subject: [PATCH 2/7] Update phi-3-tutorial.md (#361) --- examples/python/phi-3-tutorial.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 5442886a3..5ee9331e3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -81,7 +81,7 @@ python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-bl Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: ```bash -Input: <|user|>Tell me a joke about creative writing<|end|><|assistant|> +Input: Tell me a joke about creative writing Output: Why don't writers ever get lost? Because they always follow the plot! 
-``` \ No newline at end of file +``` From afd2edc892c58a41f6b69df0200e96ea6785d9e6 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 30 Apr 2024 14:48:45 -0700 Subject: [PATCH 3/7] Make OgaModel* const again (#356) --- benchmark/c/main.cpp | 2 +- src/csharp/NativeMethods.cs | 4 ++-- src/generators.cpp | 20 +++++++++++++++++++- src/generators.h | 6 ++++++ src/models/captured_graph_pool.cpp | 2 +- src/models/decoder_only.cpp | 4 ++-- src/models/model.cpp | 22 ---------------------- src/models/model.h | 5 ----- src/ort_genai.h | 4 ++-- src/ort_genai_c.cpp | 16 +++++----------- src/ort_genai_c.h | 4 ++-- src/python/python.cpp | 5 ++--- 12 files changed, 42 insertions(+), 52 deletions(-) diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 2d4b62b1f..3a4c9b43b 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -112,7 +112,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index f2906f3df..a56e7dd7e 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -71,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); @@ -129,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); diff --git a/src/generators.cpp b/src/generators.cpp index 0c664f341..bc00f8d3e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -61,7 +61,25 @@ GeneratorParams::GeneratorParams(const Model& model) eos_token_id{model.config_->model.eos_token_id}, vocab_size{model.config_->model.vocab_size}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_} { + cuda_stream{model.cuda_stream_}, + is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { +} + +void GeneratorParams::TryGraphCapture(int max_bs) { + if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + // no-op + return; + } + + if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (max_bs == 0) { + throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); + } + use_cuda_graph = true; + max_batch_size = max_bs; + } else { + throw std::runtime_error("CUDA graph is not supported on this device"); + } } std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { diff --git a/src/generators.h b/src/generators.h index c10868570..c6a510739 100644 --- a/src/generators.h +++ b/src/generators.h @@ -61,6 +61,7 @@ struct GeneratorParams : std::enable_shared_from_this { int batch_size{1}; int max_batch_size{0}; + bool use_cuda_graph{}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } @@ -97,6 +98,11 @@ struct GeneratorParams : std::enable_shared_from_this { std::vector input_ids_owner; // Backing memory of input_ids in some cases std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + + void TryGraphCapture(int max_bs); + + private: + bool is_cuda_graph_enabled_{}; }; struct Generator { diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 140f2a8cd..96cc029b8 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -24,7 +24,7 @@ static std::tuple MakeKey(int max_batch_size, int max_length, int } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { return nullptr; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 83d1f03d3..53f4f6697 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -26,7 +26,7 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, RoamingArra RoamingArray DecoderOnly_State::Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices) { if (first_run_) { - if (model_.use_cuda_graph_) { + if (params_->use_cuda_graph) { model_.run_options_->AddConfigEntry("gpu_graph_id", "-1"); } first_run_ = false; @@ -37,7 +37,7 @@ RoamingArray DecoderOnly_State::Run(int current_length, RoamingArrayuse_cuda_graph) { int new_batch_size = static_cast(input_ids_.GetShape()[0]); if (new_batch_size != current_batch_size_) 
{ current_batch_size_ = new_batch_size; diff --git a/src/models/model.cpp b/src/models/model.cpp index 439ab5c6a..6f0cc294a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -515,26 +515,4 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, return expanded; } -void Model::GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params) { - bool is_cuda_graph_enabled = device_type_ == DeviceType::DML || IsCudaGraphEnabled(config_->model.decoder.session_options); - max_batch_size_ = params.max_batch_size; - - if (DeviceType::CUDA == device_type_) { - if (is_cuda_graph_enabled) { - if (max_batch_size_ == 0) { - throw std::runtime_error("CUDA graph is enabled, but max_batch_size is not set."); - } - use_cuda_graph_ = true; - } - } else if (DeviceType::DML == device_type_) { - if (max_batch_size_ == 0) { - throw std::runtime_error("max_batch_size needs to be set when using DirectML."); - } - - use_cuda_graph_ = true; - } else if (is_cuda_graph_enabled) { - throw std::runtime_error("CUDA graph is not supported on this device"); - } -} - } // namespace Generators diff --git a/src/models/model.h b/src/models/model.h index fe3b9d832..5b9ec12d9 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -102,8 +102,6 @@ struct Model : std::enable_shared_from_this { std::unique_ptr ExpandInputs(std::unique_ptr& input, int num_beams) const; - void GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params); - CapturedGraphPool* GetCapturedGraphPool() const { return captured_graph_pool_.get(); } std::unique_ptr config_; @@ -119,9 +117,6 @@ struct Model : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime - bool use_cuda_graph_{}; - int max_batch_size_{}; - #if USE_DML DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } diff --git a/src/ort_genai.h b/src/ort_genai.h index fb863dae2..b8e55bf19 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -201,7 +201,7 @@ struct OgaGeneratorParams : OgaAbstract { }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 13cae5235..d5ab67040 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -108,7 +108,7 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { OGA_TRY auto* params = reinterpret_cast(generator_params); - params->max_batch_size = max_batch_size; + params->TryGraphCapture(max_batch_size); return nullptr; OGA_CATCH } @@ -143,23 +143,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { 
+OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - auto result = Generators::Generate(*model_p, *params); + auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - *out = reinterpret_cast(CreateGenerator(*model_p, *params).release()); + *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 0939d2c36..3e44c29e4 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -117,7 +117,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. * \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -167,7 +167,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. 
diff --git a/src/python/python.cpp b/src/python/python.cpp index cd974d916..1d8a4e567 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -104,7 +104,7 @@ struct PyGeneratorParams { } void TryUseCudaGraphWithMaxBatchSize(pybind11::int_ max_batch_size) { - params_->max_batch_size = max_batch_size.cast(); + params_->TryGraphCapture(max_batch_size.cast()); } pybind11::array_t py_input_ids_; @@ -115,7 +115,6 @@ struct PyGeneratorParams { struct PyGenerator { PyGenerator(Model& model, PyGeneratorParams& params) { params.Prepare(); - model.GetMaxBatchSizeFromGeneratorParams(params); generator_ = CreateGenerator(model, params); } @@ -229,7 +228,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) - .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); model.GetMaxBatchSizeFromGeneratorParams(params); return Generate(model, params); }) + .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") From eea971091978a7e8aa6d90a550d327e09c29d445 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 30 Apr 2024 15:22:55 -0700 Subject: [PATCH 4/7] update readme (#363) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6cab5472..113a0a5b7 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ prompt = '''def print_prime(n): tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) -params.set_search_options({"max_length":200}) +params.set_search_options(max_length=200) # Add the following line to enable cuda graph by passing the maximum batch size. 
# params.try_use_cuda_graph_with_max_batch_size(16) params.input_ids = tokens From 7dd45f2a6b3930543e65a0d2685d00eaa30fb522 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 30 Apr 2024 18:34:01 -0400 Subject: [PATCH 5/7] Use ort-nightly build for genai gha ci (#256) --- .github/workflows/linux-cpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/linux-gpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/mac-cpu-arm64-build.yml | 26 +++++----- .github/workflows/win-cpu-arm64-build.yml | 10 ++-- .github/workflows/win-cpu-x64-build.yml | 44 ++++++++++------ .github/workflows/win-cuda-x64-build.yml | 44 ++++++++++------ cmake/presets/CMakeMacOSConfigPresets.json | 2 +- nuget.config | 17 +++--- onnxruntime-genai.sln | 36 +++++++++++++ ...icrosoft.ML.OnnxRuntimeGenAI.Tests.csproj} | 3 +- 10 files changed, 208 insertions(+), 78 deletions(-) create mode 100644 onnxruntime-genai.sln rename test/csharp/{Microsoft.OnnxRuntimeGenAI.Tests.csproj => Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj} (92%) diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2e1c03aab..744fa567a 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -4,10 +4,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-1.17.3" - ort_zip: "onnxruntime-linux-x64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" jobs: linux_cpu_x64: runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ] @@ -16,19 +16,49 @@ jobs: uses: actions/checkout@v4 with: submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - - name: Download OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime 
Nightly run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ${{ env.NUGET_EXE }} install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + + - name: list files + shell: bash + run: | + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version - name: Build with CMake and GCC run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c1e51251b..123ff5f75 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -6,9 +6,11 @@ concurrency: cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-gpu-1.17.3" - ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" + jobs: linux-cuda-x64-build: @@ -29,19 +31,49 @@ jobs: clean: true path: manylinux submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - name: Download OnnxRuntime run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly + run: | + mono /usr/local/bin/nuget.exe install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + - name: list files + shell: bash run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime 
to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version + - name: Get Docker Image run: | @@ -78,7 +110,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release --parallel $( nproc )" + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Get HuggingFace Token run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9cb9cdc46..aba92d017 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -4,9 +4,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-osx-arm64-1.17.3" - ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: runs-on: macos-latest @@ -16,22 +15,21 @@ jobs: with: submodules: true - - name: Install ninja + - name: Get the Latest OnnxRuntime Nightly Version run: | - brew install ninja - - - name: Download OnnxRuntime + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV + - name: Download OnnxRuntime Nightly run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x - - name: Unzip OnnxRuntime + - name: Extra OnnxRuntime library and header files run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Rename OnnxRuntime to ort - run: | - mv ${{ env.ort_dir }} ort - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 916af3009..ce3bfcf4b 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -53,6 +53,11 @@ jobs: run: | cmake --build --preset windows_arm64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -62,10 +67,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build 
Artifacts if: always() diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ca0bb6b5b..cf5614dee 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -11,10 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-win-x64-1.17.3" - ort_zip: "$(ort_dir).zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)" binaryDir: 'build/cpu' + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: windows-cpu-x64-build: @@ -33,19 +32,32 @@ jobs: with: vs-version: '17.5' - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and nuget run: | - $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip" - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip + $ORT_NIGHTLY_VERSION = $(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly + run: | + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true - - name: Rename OnnxRuntime to ort + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -60,6 +72,11 @@ jobs: run: | cmake --build --preset windows_x64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the python wheel and test dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -76,10 +93,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index a9f602ef8..f0cebbae8 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -8,14 +8,12 @@ concurrency: env: AZCOPY_AUTO_LOGIN_TYPE: MSI AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - ort_dir: "onnxruntime-win-x64-gpu-1.17.3" - ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip" cuda_dir: "${{ github.workspace }}\\cuda_sdk" 
cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 binaryDir: 'build/cuda' - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Windows&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime.Gpu.Windows" jobs: windows-cuda-x64-build: @@ -35,17 +33,32 @@ jobs: run: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ env.cuda_version }}" ${{ env.cuda_dir}} - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and curl run: | - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh + run: | + $ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip - - name: Rename OnnxRuntime to ort + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -ExcludeVersion -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true + + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Configure CMake run: | @@ -59,6 +72,11 @@ jobs: run: | echo "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -75,10 +93,6 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index cd0c0a0b9..1ea6d85c8 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -6,7 +6,7 @@ "configurePresets": [ { "name": "macos_default", - "generator": "Ninja", + "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", diff --git a/nuget.config b/nuget.config index 3e0389a52..63a200340 100644 --- a/nuget.config +++ b/nuget.config @@ -3,11 +3,14 @@ - - - - - - - + + + + + + + + + + \ No newline at end of file diff --git a/onnxruntime-genai.sln b/onnxruntime-genai.sln new file mode 100644 index 000000000..5e59cc82e --- /dev/null +++ b/onnxruntime-genai.sln @@ -0,0 +1,36 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 
+MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{2253BDCC-33C9-431E-889A-56E3E75D10BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI", "src\csharp\Microsoft.ML.OnnxRuntimeGenAI.csproj", "{CA0EC087-3AF5-44D5-93F0-489420EBA014}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{505E2406-98C2-46DD-973A-3CEB95CF3626}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI.Tests", "test\csharp\Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj", "{24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.Build.0 = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.Build.0 = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.ActiveCfg = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014} = {2253BDCC-33C9-431E-889A-56E3E75D10BA} + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73} = {505E2406-98C2-46DD-973A-3CEB95CF3626} + EndGlobalSection +EndGlobal diff --git a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj similarity index 92% rename from test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj rename to test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index e4ec8e6d8..978deb04e 100644 --- a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -12,7 +12,8 @@ default True Debug;RelWithDebInfo;Release - + https://api.nuget.org/v3/index.json + $(RestoreAdditionalProjectSources);$(RestoreSources) Microsoft.ML.OnnxRuntimeGenAI.Tests Microsoft.ML.OnnxRuntimeGenAI.Tests From f94280f493c2f628726b7ea924592531fdb1bda1 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 17:51:11 -0700 Subject: [PATCH 6/7] Ensure CIs are running on merge (#334) --- .github/workflows/linux-cpu-arm64-build.yml | 9 ++++++++- .github/workflows/linux-cpu-x64-build.yml | 8 +++++++- .github/workflows/linux-gpu-x64-build.yml | 8 +++++++- .github/workflows/mac-cpu-arm64-build.yml | 8 +++++++- .github/workflows/win-cuda-x64-build.yml | 8 +++++++- .github/workflows/win-directml-x64-build.yml | 8 +++++++- 6 files changed, 43 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 3b55c3fe5..622b73eea 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -1,5 +1,12 @@ name: "Linux CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: + 
concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 744fa567a..290695c9c 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CPU x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 123ff5f75..f6cdf0f37 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index aba92d017..f2f90e427 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -1,5 +1,11 @@ name: "MacOS CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index f0cebbae8..ccc2f71fe 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 152b9ab1d..f7dcd89d0 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows DirectML x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From b3ff5cec93015ef8b76ce7778be1df0acb3d893c Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:38:06 -0700 Subject: [PATCH 7/7] Add 'add_extra_input' to handle models like QLora (#370) Add a new python api 'add_extra_input' that will take numpy tensors and turn them into OrtValue inputs internally. This allows models with extra custom inputs (like QLora) to be specified in python. C API to follow soon. 
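For illustration, a minimal sketch of how the new API could be called from Python (the model path and the extra input name below are placeholders; the extra input is matched against the model's input names at runtime, and the numpy dtype must be one of int32, uint32, float16, float32 or float64):

```python
import numpy as np
import onnxruntime_genai as og

model = og.Model("path/to/model")          # placeholder model folder
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.input_ids = tokenizer.encode("def print_prime(n):")
params.set_search_options(max_length=200)

# "lora_scale" is a hypothetical input name; it must match an input
# declared by the model. The numpy array is wrapped as an OrtValue internally.
params.add_extra_input("lora_scale", np.array([1.0], dtype=np.float32))

output_tokens = model.generate(params)
print(tokenizer.decode(output_tokens[0]))
```
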
--- src/generators.h | 8 ++++++++ src/models/model.cpp | 5 +++++ src/models/model.h | 2 ++ src/models/static_buffer.cpp | 19 ++----------------- src/models/static_buffer.h | 1 - src/python/python.cpp | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 18 deletions(-) diff --git a/src/generators.h b/src/generators.h index c6a510739..e6ad6f0e1 100644 --- a/src/generators.h +++ b/src/generators.h @@ -99,6 +99,14 @@ struct GeneratorParams : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + struct Input { + std::string name; + std::unique_ptr value; + }; + + // A list of extra model inputs that will be matched at runtime based on name + std::vector extra_inputs; + void TryGraphCapture(int max_bs); private: diff --git a/src/models/model.cpp b/src/models/model.cpp index 6f0cc294a..35a9b4ad4 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -35,6 +35,11 @@ static std::wstring CurrentModulePath() { namespace Generators { State::State(const GeneratorParams& params) : params_{params.shared_from_this()} { + // Add extra user inputs + for (auto& input : params.extra_inputs) { + input_names_.push_back(input.name.c_str()); + inputs_.push_back(input.value.get()); + } } void State::Run(OrtSession& session, OrtRunOptions& run_options) { diff --git a/src/models/model.h b/src/models/model.h index 5b9ec12d9..165e7c345 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -16,6 +16,8 @@ struct Tokenizer; void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream); +size_t GetOrtTypeSize(ONNXTensorElementDataType type); + struct State { State(const GeneratorParams& params); virtual ~State() = default; diff --git a/src/models/static_buffer.cpp b/src/models/static_buffer.cpp index 9bc5f50ea..eab776e65 100644 --- a/src/models/static_buffer.cpp +++ b/src/models/static_buffer.cpp @@ -1,4 +1,5 @@ #include "../generators.h" +#include "model.h" #include "static_buffer.h" namespace Generators { @@ -8,7 +9,7 @@ StaticBuffer::StaticBuffer(Ort::Allocator* allocator, size_t max_beam_batch_size std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape, ONNXTensorElementDataType type) { - size_t new_bytes = GetElementSize(type) * GetNumElements(shape); + size_t new_bytes = GetOrtTypeSize(type) * GetNumElements(shape); if (buffer_ == nullptr) { // Assuming the first dimension is the batch size bytes_ = new_bytes * (max_beam_batch_size_ / shape[0]); @@ -21,22 +22,6 @@ std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape) { size_t num_elements = 1; for (auto dim : shape) { diff --git a/src/models/static_buffer.h b/src/models/static_buffer.h index ce9e14686..8c133fdae 100644 --- a/src/models/static_buffer.h +++ b/src/models/static_buffer.h @@ -18,7 +18,6 @@ struct StaticBuffer { ONNXTensorElementDataType type); private: - size_t GetElementSize(ONNXTensorElementDataType type); size_t GetNumElements(std::span shape); Ort::Allocator* allocator_{nullptr}; diff --git a/src/python/python.cpp b/src/python/python.cpp index 1d8a4e567..8bd25a9d3 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -22,6 +22,34 @@ pybind11::array_t ToPython(std::span v) { return pybind11::array_t(v.size(), v.data()); } +ONNXTensorElementDataType ToTensorType(const pybind11::dtype& type) { + switch (type.num()) { + case pybind11::detail::npy_api::NPY_INT32_: + return Ort::TypeToTensorType::type; + case 
pybind11::detail::npy_api::NPY_UINT32_: + return Ort::TypeToTensorType::type; + case 23 /*NPY_FLOAT16*/: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_FLOAT_: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_DOUBLE_: + return Ort::TypeToTensorType::type; + default: + throw std::runtime_error("Unsupported numpy type"); + } +} + +std::unique_ptr ToTensor(pybind11::array& v) { + auto type = ToTensorType(v.dtype()); + + std::vector shape(v.ndim()); + for (pybind11::ssize_t i = 0; i < v.ndim(); i++) + shape[i] = v.shape()[i]; + + auto p_memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + return OrtValue::CreateTensor(*p_memory_info, v.mutable_data(), v.nbytes(), shape, type); +} + namespace Generators { // A roaming array is one that can be in CPU or GPU memory, and will copy the memory as needed to be used from anywhere @@ -85,6 +113,11 @@ struct PyGeneratorParams { } } + void AddExtraInput(const std::string& name, pybind11::array& value) { + params_->extra_inputs.push_back({name, ToTensor(value)}); + refs_.emplace_back(value); + } + void SetSearchOptions(const pybind11::kwargs& dict) { for (auto& entry : dict) { auto name = entry.first.cast(); @@ -110,6 +143,8 @@ struct PyGeneratorParams { pybind11::array_t py_input_ids_; pybind11::array_t py_whisper_input_features_; pybind11::array_t py_whisper_decoder_input_ids_; + + std::vector refs_; // References to data we want to ensure doesn't get garbage collected }; struct PyGenerator { @@ -198,6 +233,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def_readwrite("input_ids", &PyGeneratorParams::py_input_ids_) .def_readwrite("whisper_input_features", &PyGeneratorParams::py_whisper_input_features_) .def_readwrite("whisper_decoder_input_ids", &PyGeneratorParams::py_whisper_decoder_input_ids_) + .def("add_extra_input", &PyGeneratorParams::AddExtraInput) .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize);