From a028d7879a2adc92f6a8bf0babe01d1c7398c15f Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:31:46 -0700 Subject: [PATCH 1/7] Simulate the chat template (#352) --- examples/python/model-qa.py | 12 ++++- examples/python/phi-3-tutorial.md | 6 +-- examples/python/phi3-qa.py | 87 +++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 examples/python/phi3-qa.py diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 6f323ccc4..57ec9f6db 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -15,6 +15,9 @@ def main(args): if args.verbose: print("Tokenizer created") if args.verbose: print() search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1: + print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'") + exit(1) # Keep asking for input prompts in a loop while True: @@ -25,7 +28,12 @@ def main(args): if args.timings: started_timestamp = time.time() - input_tokens = tokenizer.encode(args.system_prompt + text) + # If there is a chat template, use it + prompt = text + if args.chat_template: + prompt = f'{args.chat_template.format(input=text)}' + + input_tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) params.try_use_cuda_graph_with_max_batch_size(1) @@ -76,7 +84,7 @@ def main(args): parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-s', '--system_prompt', type=str, default='', help='Prepend a system prompt to the user input prompt. Defaults to empty') parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}') args = parser.parse_args() main(args) \ No newline at end of file diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 6d2f5f728..5442886a3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -64,7 +64,7 @@ pip install --pre onnxruntime-genai ## Run the model -Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. @@ -74,8 +74,8 @@ The `-m` argument is the path to the model you downloaded from HuggingFace above The `-l` argument is the length of output you would like to generate with the model. 
```bash -curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py -python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 +curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py +python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py new file mode 100644 index 000000000..9e9392895 --- /dev/null +++ b/examples/python/phi3-qa.py @@ -0,0 +1,87 @@ +import onnxruntime_genai as og +import argparse +import time + +def main(args): + if args.verbose: print("Loading model...") + if args.timings: + started_timestamp = 0 + first_token_timestamp = 0 + + model = og.Model(f'{args.model}') + if args.verbose: print("Model loaded") + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + if args.verbose: print("Tokenizer created") + if args.verbose: print() + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + + # Keep asking for input prompts in a loop + while True: + text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + + if args.timings: started_timestamp = time.time() + + # If there is a chat template, use it + prompt = f'{chat_template.format(input=text)}' + + print(f"Prompt: {prompt}") + + input_tokens = tokenizer.encode(prompt) + + params = og.GeneratorParams(model) + params.try_use_cuda_graph_with_max_batch_size(1) + params.set_search_options(**search_options) + params.input_ids = input_tokens + generator = og.Generator(model, params) + if args.verbose: print("Generator created") + + if args.verbose: print("Running generation loop ...") + if args.timings: + first = True + new_tokens = [] + + print() + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.compute_logits() + generator.generate_next_token() + if args.timings: + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end='', flush=True) + if args.timings: new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + print() + print() + + if args.timings: + prompt_time = first_token_timestamp - started_timestamp + run_time = time.time() - first_token_timestamp + print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)') + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including 
the prompt') + parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') + parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') + parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') + parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + args = parser.parse_args() + main(args) \ No newline at end of file From cb4e3aaf982abcd22adfc01a9d4663f76f645561 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:35:37 -0700 Subject: [PATCH 2/7] Update phi-3-tutorial.md (#361) --- examples/python/phi-3-tutorial.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 5442886a3..5ee9331e3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -81,7 +81,7 @@ python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-bl Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: ```bash -Input: <|user|>Tell me a joke about creative writing<|end|><|assistant|> +Input: Tell me a joke about creative writing Output: Why don't writers ever get lost? Because they always follow the plot! 
-``` \ No newline at end of file +``` From afd2edc892c58a41f6b69df0200e96ea6785d9e6 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 30 Apr 2024 14:48:45 -0700 Subject: [PATCH 3/7] Make OgaModel* const again (#356) --- benchmark/c/main.cpp | 2 +- src/csharp/NativeMethods.cs | 4 ++-- src/generators.cpp | 20 +++++++++++++++++++- src/generators.h | 6 ++++++ src/models/captured_graph_pool.cpp | 2 +- src/models/decoder_only.cpp | 4 ++-- src/models/model.cpp | 22 ---------------------- src/models/model.h | 5 ----- src/ort_genai.h | 4 ++-- src/ort_genai_c.cpp | 16 +++++----------- src/ort_genai_c.h | 4 ++-- src/python/python.cpp | 5 ++--- 12 files changed, 42 insertions(+), 52 deletions(-) diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 2d4b62b1f..3a4c9b43b 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -112,7 +112,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index f2906f3df..a56e7dd7e 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -71,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); @@ -129,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); diff --git a/src/generators.cpp b/src/generators.cpp index 0c664f341..bc00f8d3e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -61,7 +61,25 @@ GeneratorParams::GeneratorParams(const Model& model) eos_token_id{model.config_->model.eos_token_id}, vocab_size{model.config_->model.vocab_size}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_} { + cuda_stream{model.cuda_stream_}, + is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { +} + +void GeneratorParams::TryGraphCapture(int max_bs) { + if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + // no-op + return; + } + + if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (max_bs == 0) { + throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); + } + use_cuda_graph = true; + max_batch_size = max_bs; + } else { + throw std::runtime_error("CUDA graph is not supported on this device"); + } } std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { diff --git a/src/generators.h b/src/generators.h index c10868570..c6a510739 100644 --- a/src/generators.h +++ b/src/generators.h @@ -61,6 +61,7 @@ struct GeneratorParams : std::enable_shared_from_this { int batch_size{1}; int max_batch_size{0}; + bool use_cuda_graph{}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } @@ -97,6 +98,11 @@ struct GeneratorParams : std::enable_shared_from_this { std::vector input_ids_owner; // Backing memory of input_ids in some cases std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + + void TryGraphCapture(int max_bs); + + private: + bool is_cuda_graph_enabled_{}; }; struct Generator { diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 140f2a8cd..96cc029b8 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -24,7 +24,7 @@ static std::tuple MakeKey(int max_batch_size, int max_length, int } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { return nullptr; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 83d1f03d3..53f4f6697 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -26,7 +26,7 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, RoamingArra RoamingArray DecoderOnly_State::Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices) { if (first_run_) { - if (model_.use_cuda_graph_) { + if (params_->use_cuda_graph) { model_.run_options_->AddConfigEntry("gpu_graph_id", "-1"); } first_run_ = false; @@ -37,7 +37,7 @@ RoamingArray DecoderOnly_State::Run(int current_length, RoamingArrayuse_cuda_graph) { int new_batch_size = static_cast(input_ids_.GetShape()[0]); if (new_batch_size != current_batch_size_) 
{ current_batch_size_ = new_batch_size; diff --git a/src/models/model.cpp b/src/models/model.cpp index 439ab5c6a..6f0cc294a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -515,26 +515,4 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, return expanded; } -void Model::GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params) { - bool is_cuda_graph_enabled = device_type_ == DeviceType::DML || IsCudaGraphEnabled(config_->model.decoder.session_options); - max_batch_size_ = params.max_batch_size; - - if (DeviceType::CUDA == device_type_) { - if (is_cuda_graph_enabled) { - if (max_batch_size_ == 0) { - throw std::runtime_error("CUDA graph is enabled, but max_batch_size is not set."); - } - use_cuda_graph_ = true; - } - } else if (DeviceType::DML == device_type_) { - if (max_batch_size_ == 0) { - throw std::runtime_error("max_batch_size needs to be set when using DirectML."); - } - - use_cuda_graph_ = true; - } else if (is_cuda_graph_enabled) { - throw std::runtime_error("CUDA graph is not supported on this device"); - } -} - } // namespace Generators diff --git a/src/models/model.h b/src/models/model.h index fe3b9d832..5b9ec12d9 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -102,8 +102,6 @@ struct Model : std::enable_shared_from_this { std::unique_ptr ExpandInputs(std::unique_ptr& input, int num_beams) const; - void GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params); - CapturedGraphPool* GetCapturedGraphPool() const { return captured_graph_pool_.get(); } std::unique_ptr config_; @@ -119,9 +117,6 @@ struct Model : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime - bool use_cuda_graph_{}; - int max_batch_size_{}; - #if USE_DML DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } diff --git a/src/ort_genai.h b/src/ort_genai.h index fb863dae2..b8e55bf19 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -201,7 +201,7 @@ struct OgaGeneratorParams : OgaAbstract { }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 13cae5235..d5ab67040 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -108,7 +108,7 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { OGA_TRY auto* params = reinterpret_cast(generator_params); - params->max_batch_size = max_batch_size; + params->TryGraphCapture(max_batch_size); return nullptr; OGA_CATCH } @@ -143,23 +143,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { 
+OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - auto result = Generators::Generate(*model_p, *params); + auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - *out = reinterpret_cast(CreateGenerator(*model_p, *params).release()); + *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 0939d2c36..3e44c29e4 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -117,7 +117,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. * \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -167,7 +167,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. 
diff --git a/src/python/python.cpp b/src/python/python.cpp index cd974d916..1d8a4e567 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -104,7 +104,7 @@ struct PyGeneratorParams { } void TryUseCudaGraphWithMaxBatchSize(pybind11::int_ max_batch_size) { - params_->max_batch_size = max_batch_size.cast(); + params_->TryGraphCapture(max_batch_size.cast()); } pybind11::array_t py_input_ids_; @@ -115,7 +115,6 @@ struct PyGeneratorParams { struct PyGenerator { PyGenerator(Model& model, PyGeneratorParams& params) { params.Prepare(); - model.GetMaxBatchSizeFromGeneratorParams(params); generator_ = CreateGenerator(model, params); } @@ -229,7 +228,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) - .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); model.GetMaxBatchSizeFromGeneratorParams(params); return Generate(model, params); }) + .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") From eea971091978a7e8aa6d90a550d327e09c29d445 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 30 Apr 2024 15:22:55 -0700 Subject: [PATCH 4/7] update readme (#363) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6cab5472..113a0a5b7 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ prompt = '''def print_prime(n): tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) -params.set_search_options({"max_length":200}) +params.set_search_options(max_length=200) # Add the following line to enable cuda graph by passing the maximum batch size. 
# params.try_use_cuda_graph_with_max_batch_size(16) params.input_ids = tokens From 7dd45f2a6b3930543e65a0d2685d00eaa30fb522 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 30 Apr 2024 18:34:01 -0400 Subject: [PATCH 5/7] Use ort-nightly build for genai gha ci (#256) --- .github/workflows/linux-cpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/linux-gpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/mac-cpu-arm64-build.yml | 26 +++++----- .github/workflows/win-cpu-arm64-build.yml | 10 ++-- .github/workflows/win-cpu-x64-build.yml | 44 ++++++++++------ .github/workflows/win-cuda-x64-build.yml | 44 ++++++++++------ cmake/presets/CMakeMacOSConfigPresets.json | 2 +- nuget.config | 17 +++--- onnxruntime-genai.sln | 36 +++++++++++++ ...icrosoft.ML.OnnxRuntimeGenAI.Tests.csproj} | 3 +- 10 files changed, 208 insertions(+), 78 deletions(-) create mode 100644 onnxruntime-genai.sln rename test/csharp/{Microsoft.OnnxRuntimeGenAI.Tests.csproj => Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj} (92%) diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2e1c03aab..744fa567a 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -4,10 +4,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-1.17.3" - ort_zip: "onnxruntime-linux-x64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" jobs: linux_cpu_x64: runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ] @@ -16,19 +16,49 @@ jobs: uses: actions/checkout@v4 with: submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - - name: Download OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime 
Nightly run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ${{ env.NUGET_EXE }} install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + + - name: list files + shell: bash + run: | + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version - name: Build with CMake and GCC run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c1e51251b..123ff5f75 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -6,9 +6,11 @@ concurrency: cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-gpu-1.17.3" - ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" + jobs: linux-cuda-x64-build: @@ -29,19 +31,49 @@ jobs: clean: true path: manylinux submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - name: Download OnnxRuntime run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly + run: | + mono /usr/local/bin/nuget.exe install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + - name: list files + shell: bash run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime 
to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version + - name: Get Docker Image run: | @@ -78,7 +110,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release --parallel $( nproc )" + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Get HuggingFace Token run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9cb9cdc46..aba92d017 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -4,9 +4,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-osx-arm64-1.17.3" - ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: runs-on: macos-latest @@ -16,22 +15,21 @@ jobs: with: submodules: true - - name: Install ninja + - name: Get the Latest OnnxRuntime Nightly Version run: | - brew install ninja - - - name: Download OnnxRuntime + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV + - name: Download OnnxRuntime Nightly run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x - - name: Unzip OnnxRuntime + - name: Extra OnnxRuntime library and header files run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Rename OnnxRuntime to ort - run: | - mv ${{ env.ort_dir }} ort - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 916af3009..ce3bfcf4b 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -53,6 +53,11 @@ jobs: run: | cmake --build --preset windows_arm64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -62,10 +67,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build 
Artifacts if: always() diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ca0bb6b5b..cf5614dee 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -11,10 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-win-x64-1.17.3" - ort_zip: "$(ort_dir).zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)" binaryDir: 'build/cpu' + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: windows-cpu-x64-build: @@ -33,19 +32,32 @@ jobs: with: vs-version: '17.5' - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and nuget run: | - $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip" - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip + $ORT_NIGHTLY_VERSION = $(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly + run: | + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true - - name: Rename OnnxRuntime to ort + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -60,6 +72,11 @@ jobs: run: | cmake --build --preset windows_x64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the python wheel and test dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -76,10 +93,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index a9f602ef8..f0cebbae8 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -8,14 +8,12 @@ concurrency: env: AZCOPY_AUTO_LOGIN_TYPE: MSI AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - ort_dir: "onnxruntime-win-x64-gpu-1.17.3" - ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip" cuda_dir: "${{ github.workspace }}\\cuda_sdk" 
cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 binaryDir: 'build/cuda' - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Windows&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime.Gpu.Windows" jobs: windows-cuda-x64-build: @@ -35,17 +33,32 @@ jobs: run: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ env.cuda_version }}" ${{ env.cuda_dir}} - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and curl run: | - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh + run: | + $ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip - - name: Rename OnnxRuntime to ort + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -ExcludeVersion -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true + + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Configure CMake run: | @@ -59,6 +72,11 @@ jobs: run: | echo "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -75,10 +93,6 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index cd0c0a0b9..1ea6d85c8 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -6,7 +6,7 @@ "configurePresets": [ { "name": "macos_default", - "generator": "Ninja", + "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", diff --git a/nuget.config b/nuget.config index 3e0389a52..63a200340 100644 --- a/nuget.config +++ b/nuget.config @@ -3,11 +3,14 @@ - - - - - - - + + + + + + + + + + \ No newline at end of file diff --git a/onnxruntime-genai.sln b/onnxruntime-genai.sln new file mode 100644 index 000000000..5e59cc82e --- /dev/null +++ b/onnxruntime-genai.sln @@ -0,0 +1,36 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 
+MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{2253BDCC-33C9-431E-889A-56E3E75D10BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI", "src\csharp\Microsoft.ML.OnnxRuntimeGenAI.csproj", "{CA0EC087-3AF5-44D5-93F0-489420EBA014}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{505E2406-98C2-46DD-973A-3CEB95CF3626}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI.Tests", "test\csharp\Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj", "{24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.Build.0 = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.Build.0 = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.ActiveCfg = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014} = {2253BDCC-33C9-431E-889A-56E3E75D10BA} + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73} = {505E2406-98C2-46DD-973A-3CEB95CF3626} + EndGlobalSection +EndGlobal diff --git a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj similarity index 92% rename from test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj rename to test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index e4ec8e6d8..978deb04e 100644 --- a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -12,7 +12,8 @@ default True Debug;RelWithDebInfo;Release - + https://api.nuget.org/v3/index.json + $(RestoreAdditionalProjectSources);$(RestoreSources) Microsoft.ML.OnnxRuntimeGenAI.Tests Microsoft.ML.OnnxRuntimeGenAI.Tests From f94280f493c2f628726b7ea924592531fdb1bda1 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 17:51:11 -0700 Subject: [PATCH 6/7] Ensure CIs are running on merge (#334) --- .github/workflows/linux-cpu-arm64-build.yml | 9 ++++++++- .github/workflows/linux-cpu-x64-build.yml | 8 +++++++- .github/workflows/linux-gpu-x64-build.yml | 8 +++++++- .github/workflows/mac-cpu-arm64-build.yml | 8 +++++++- .github/workflows/win-cuda-x64-build.yml | 8 +++++++- .github/workflows/win-directml-x64-build.yml | 8 +++++++- 6 files changed, 43 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 3b55c3fe5..622b73eea 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -1,5 +1,12 @@ name: "Linux CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: + 
concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 744fa567a..290695c9c 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CPU x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 123ff5f75..f6cdf0f37 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index aba92d017..f2f90e427 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -1,5 +1,11 @@ name: "MacOS CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index f0cebbae8..ccc2f71fe 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 152b9ab1d..f7dcd89d0 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows DirectML x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From b3ff5cec93015ef8b76ce7778be1df0acb3d893c Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:38:06 -0700 Subject: [PATCH 7/7] Add 'add_extra_input' to handle models like QLora (#370) Add a new python api 'add_extra_input' that will take numpy tensors and turn them into OrtValue inputs internally. This allows models with extra custom inputs (like QLora) to be specified in python. C API to follow soon. 
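For illustration, a minimal sketch of how the new API could be called from Python (the model path and the extra input name below are placeholders; the extra input is matched against the model's input names at runtime, and the numpy dtype must be one of int32, uint32, float16, float32 or float64):

```python
import numpy as np
import onnxruntime_genai as og

model = og.Model("path/to/model")          # placeholder model folder
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.input_ids = tokenizer.encode("def print_prime(n):")
params.set_search_options(max_length=200)

# "lora_scale" is a hypothetical input name; it must match an input
# declared by the model. The numpy array is wrapped as an OrtValue internally.
params.add_extra_input("lora_scale", np.array([1.0], dtype=np.float32))

output_tokens = model.generate(params)
print(tokenizer.decode(output_tokens[0]))
```
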
--- src/generators.h | 8 ++++++++ src/models/model.cpp | 5 +++++ src/models/model.h | 2 ++ src/models/static_buffer.cpp | 19 ++----------------- src/models/static_buffer.h | 1 - src/python/python.cpp | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 18 deletions(-) diff --git a/src/generators.h b/src/generators.h index c6a510739..e6ad6f0e1 100644 --- a/src/generators.h +++ b/src/generators.h @@ -99,6 +99,14 @@ struct GeneratorParams : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + struct Input { + std::string name; + std::unique_ptr value; + }; + + // A list of extra model inputs that will be matched at runtime based on name + std::vector extra_inputs; + void TryGraphCapture(int max_bs); private: diff --git a/src/models/model.cpp b/src/models/model.cpp index 6f0cc294a..35a9b4ad4 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -35,6 +35,11 @@ static std::wstring CurrentModulePath() { namespace Generators { State::State(const GeneratorParams& params) : params_{params.shared_from_this()} { + // Add extra user inputs + for (auto& input : params.extra_inputs) { + input_names_.push_back(input.name.c_str()); + inputs_.push_back(input.value.get()); + } } void State::Run(OrtSession& session, OrtRunOptions& run_options) { diff --git a/src/models/model.h b/src/models/model.h index 5b9ec12d9..165e7c345 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -16,6 +16,8 @@ struct Tokenizer; void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream); +size_t GetOrtTypeSize(ONNXTensorElementDataType type); + struct State { State(const GeneratorParams& params); virtual ~State() = default; diff --git a/src/models/static_buffer.cpp b/src/models/static_buffer.cpp index 9bc5f50ea..eab776e65 100644 --- a/src/models/static_buffer.cpp +++ b/src/models/static_buffer.cpp @@ -1,4 +1,5 @@ #include "../generators.h" +#include "model.h" #include "static_buffer.h" namespace Generators { @@ -8,7 +9,7 @@ StaticBuffer::StaticBuffer(Ort::Allocator* allocator, size_t max_beam_batch_size std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape, ONNXTensorElementDataType type) { - size_t new_bytes = GetElementSize(type) * GetNumElements(shape); + size_t new_bytes = GetOrtTypeSize(type) * GetNumElements(shape); if (buffer_ == nullptr) { // Assuming the first dimension is the batch size bytes_ = new_bytes * (max_beam_batch_size_ / shape[0]); @@ -21,22 +22,6 @@ std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape) { size_t num_elements = 1; for (auto dim : shape) { diff --git a/src/models/static_buffer.h b/src/models/static_buffer.h index ce9e14686..8c133fdae 100644 --- a/src/models/static_buffer.h +++ b/src/models/static_buffer.h @@ -18,7 +18,6 @@ struct StaticBuffer { ONNXTensorElementDataType type); private: - size_t GetElementSize(ONNXTensorElementDataType type); size_t GetNumElements(std::span shape); Ort::Allocator* allocator_{nullptr}; diff --git a/src/python/python.cpp b/src/python/python.cpp index 1d8a4e567..8bd25a9d3 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -22,6 +22,34 @@ pybind11::array_t ToPython(std::span v) { return pybind11::array_t(v.size(), v.data()); } +ONNXTensorElementDataType ToTensorType(const pybind11::dtype& type) { + switch (type.num()) { + case pybind11::detail::npy_api::NPY_INT32_: + return Ort::TypeToTensorType::type; + case 
pybind11::detail::npy_api::NPY_UINT32_: + return Ort::TypeToTensorType::type; + case 23 /*NPY_FLOAT16*/: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_FLOAT_: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_DOUBLE_: + return Ort::TypeToTensorType::type; + default: + throw std::runtime_error("Unsupported numpy type"); + } +} + +std::unique_ptr ToTensor(pybind11::array& v) { + auto type = ToTensorType(v.dtype()); + + std::vector shape(v.ndim()); + for (pybind11::ssize_t i = 0; i < v.ndim(); i++) + shape[i] = v.shape()[i]; + + auto p_memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + return OrtValue::CreateTensor(*p_memory_info, v.mutable_data(), v.nbytes(), shape, type); +} + namespace Generators { // A roaming array is one that can be in CPU or GPU memory, and will copy the memory as needed to be used from anywhere @@ -85,6 +113,11 @@ struct PyGeneratorParams { } } + void AddExtraInput(const std::string& name, pybind11::array& value) { + params_->extra_inputs.push_back({name, ToTensor(value)}); + refs_.emplace_back(value); + } + void SetSearchOptions(const pybind11::kwargs& dict) { for (auto& entry : dict) { auto name = entry.first.cast(); @@ -110,6 +143,8 @@ struct PyGeneratorParams { pybind11::array_t py_input_ids_; pybind11::array_t py_whisper_input_features_; pybind11::array_t py_whisper_decoder_input_ids_; + + std::vector refs_; // References to data we want to ensure doesn't get garbage collected }; struct PyGenerator { @@ -198,6 +233,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def_readwrite("input_ids", &PyGeneratorParams::py_input_ids_) .def_readwrite("whisper_input_features", &PyGeneratorParams::py_whisper_input_features_) .def_readwrite("whisper_decoder_input_ids", &PyGeneratorParams::py_whisper_decoder_input_ids_) + .def("add_extra_input", &PyGeneratorParams::AddExtraInput) .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize);