From a6cbaf54122d946544af7a7a3613a7fee4eabed3 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Mon, 29 Apr 2024 11:11:29 -0700 Subject: [PATCH 01/15] Fix naming transpose nodes (#348) ### Description This PR adds the node name for the `Transpose` nodes. ### Motivation and Context While adding Phi-3 mini to the model builder, a new checker was added to prevent adding the same node twice in an ONNX model. The new checker found that the `Transpose` nodes were missing their unique names. --- src/python/py/models/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1a9c37a3e..a2c61b947 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -528,7 +528,7 @@ def make_mul(self, name, inputs, dtype, shape): def make_transpose(self, name, root_input, dtype, shape, perm): output = f"{name}/output_0" - self.make_node("Transpose", inputs=[root_input], outputs=[output], perm=perm) + self.make_node("Transpose", inputs=[root_input], outputs=[output], name=name, perm=perm) self.make_value_info(output, dtype, shape=shape) def make_matmul(self, matmul, name, root_input, **kwargs): From 769162cfb364e9ace09a9143ee768e152be0498a Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Mon, 29 Apr 2024 11:32:14 -0700 Subject: [PATCH 02/15] fix C++ APIs in C example (#347) --- examples/c/src/main.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index e4be639f2..09f0c9aa5 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -5,11 +5,14 @@ // C++ API Example void CXX_API(const char* model_path) { + std::cout << "Creating model..." << std::endl; auto model = OgaModel::Create(model_path); + std::cout << "Creating tokenizer..." << std::endl; auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; - std::cout << "Prompt: " << std::endl << prompt << std::endl; + std::cout << "Prompt: " << std::endl + << prompt << std::endl; auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt, *sequences); @@ -19,16 +22,19 @@ void CXX_API(const char* model_path) { params->SetInputSequences(*sequences); auto output_sequences = model->Generate(*params); - auto out_string = tokenizer->Decode(output_sequences->Get(0)); + const auto output_sequence_length = output_sequences->SequenceCount(0); + const auto* output_sequence_data = output_sequences->SequenceData(0); + auto out_string = tokenizer->Decode(output_sequence_data, output_sequence_length); - std::cout << "Output: " << std::endl << out_string << std::endl; + std::cout << "Output: " << std::endl + << out_string << std::endl; } // C API Example void CheckResult(OgaResult* result) { if (result) { - std::string string=OgaResultGetError(result); + std::string string = OgaResultGetError(result); OgaDestroyResult(result); throw std::runtime_error(string); } @@ -36,9 +42,11 @@ void CheckResult(OgaResult* result) { void C_API(const char* model_path) { OgaModel* model; + std::cout << "Creating model..." << std::endl; OgaCreateModel(model_path, &model); OgaTokenizer* tokenizer; + std::cout << "Creating tokenizer..." << std::endl; CheckResult(OgaCreateTokenizer(model, &tokenizer)); const char* prompt = "def is_prime(num):"; @@ -84,7 +92,6 @@ int main(int argc, char** argv) { return -1; } - std::cout << "-------------" << std::endl; std::cout << "Hello, Phi-2!" 
<< std::endl; std::cout << "-------------" << std::endl; From a9fd3264ce6a49c0f9003710d42f24b96e32accc Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 29 Apr 2024 16:07:14 -0400 Subject: [PATCH 03/15] Merging Rel-0.2.0 back to main (#342) Co-authored-by: Patrice Vignola Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Co-authored-by: Nat Kershaw (MSFT) Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Kunal Vaishnavi Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Co-authored-by: Yufeng Li Co-authored-by: Baiju Meswani --- .pipelines/nuget-publishing.yml | 14 +- .pipelines/pypl-publishing.yml | 14 +- .../stages/jobs/nuget-packaging-job.yml | 27 +++- .pipelines/stages/jobs/py-packaging-job.yml | 21 ++- .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../stages/jobs/steps/capi-win-step.yml | 16 ++- .../stages/jobs/steps/nuget-win-step.yml | 4 +- .../stages/jobs/steps/utils/capi-archive.yml | 1 + .../stages/jobs/steps/utils/download-ort.yml | 74 +++++++---- .pipelines/stages/nuget-packaging-stage.yml | 19 ++- .pipelines/stages/py-packaging-stage.yml | 21 ++- benchmark/c/main.cpp | 2 +- cmake/presets/CMakeWinConfigPresets.json | 40 ++---- examples/python/phi-3-tutorial.md | 121 ++++++++++++++---- src/csharp/GeneratorParams.cs | 5 + src/csharp/NativeMethods.cs | 8 +- src/ort_genai.h | 8 +- src/ort_genai_c.cpp | 22 +++- src/ort_genai_c.h | 5 +- 19 files changed, 320 insertions(+), 104 deletions(-) diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index b6fff7111..00083c65c 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -19,12 +19,21 @@ parameters: type: boolean default: true +- name: enable_win_dml + displayName: 'Whether Windows DirectML package is built.' + type: boolean + default: true - name: ort_version displayName: 'OnnxRuntime version' type: string default: '1.17.3' +- name: ort_dml_version + displayName: 'OnnxRuntime DirectML version' + type: string + default: '1.18.0-dev-20240423-0527-c07b8d545d' + - name: cuda_version displayName: 'CUDA version' type: string @@ -54,6 +63,9 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_linux_cpu: ${{ parameters.enable_linux_cpu }} enable_linux_cuda: ${{ parameters.enable_linux_cuda }} + enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + \ No newline at end of file diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index 1069ede44..1b493ca69 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -7,6 +7,11 @@ parameters: - name: enable_win_cuda displayName: 'Whether Windows CUDA package is built.' type: boolean + default : true + +- name: enable_win_dml + displayName: 'Whether Windows DirectML package is built.' 
+ type: boolean default: true - name: enable_linux_cpu @@ -24,6 +29,11 @@ parameters: type: string default: '1.17.3' +- name: ort_dml_version + displayName: 'OnnxRuntime DirectML version' + type: string + default: '1.18.0-dev-20240423-0527-c07b8d545d' + - name: cuda_version displayName: 'CUDA version' type: string @@ -53,6 +63,8 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_cpu: ${{ parameters.enable_win_cpu }} enable_win_cuda: ${{ parameters.enable_win_cuda }} + enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index f790215b7..f971a4586 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -21,7 +21,10 @@ jobs: ${{ if eq(parameters.os, 'linux') }}: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' ${{ if eq(parameters.os, 'win') }}: - pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.ep, 'directml') }}: + pool: 'onnxruntime-Win2022-GPU-dml-A10' + ${{ else }}: + pool: 'onnxruntime-Win-CPU-2022' timeoutInMinutes: 180 # set variables here to be used in the template and steps variables: @@ -44,21 +47,39 @@ jobs: - name: ort_filename ${{ if eq(parameters.ep, 'cpu') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: + ${{ elseif eq(parameters.ep, 'cuda') }}: ${{if eq(parameters.cuda_version, '11.8') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' ${{ else }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + ${{ elseif eq(parameters.ep, 'directml')}}: + value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' - name: genai_nuget_ext ${{ if eq(parameters.ep, 'cpu') }}: value: '' ${{ if eq(parameters.ep, 'cuda') }}: value: '.Cuda' + ${{ if eq(parameters.ep, 'directml') }}: + value: '.DirectML' - name: ortHome - value: 'ort' + value: 'ort' + - name: dml_dir + value: 'Microsoft.AI.DirectML.1.14.1' + - name: dml_zip + value: 'Microsoft.AI.DirectML.1.14.1.zip' + - name: dml_url + value: "https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1" workspace: clean: all steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: 3.10 + addToPath: true + architecture: $(arch) + - ${{ if eq(parameters.os, 'linux') }}: - template: steps/capi-linux-step.yml parameters: diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml index 7066b070e..ba9bc5731 100644 --- a/.pipelines/stages/jobs/py-packaging-job.yml +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -21,7 +21,11 @@ jobs: ${{ if eq(parameters.os, 'linux') }}: pool: 'onnxruntime-Ubuntu2204-AMD-CPU' ${{ if eq(parameters.os, 'win') }}: - pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.ep, 'directml') }}: + pool: 'onnxruntime-Win2022-GPU-dml-A10' + ${{ else }}: + pool: 'onnxruntime-Win-CPU-2022' + strategy: matrix: Python38: @@ -66,11 +70,23 @@ jobs: - name: ort_filename ${{ if eq(parameters.ep, 
'cpu') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' - ${{ else}}: + ${{ elseif eq(parameters.ep, 'cuda') }}: ${{if eq(parameters.cuda_version, '11.8') }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' ${{ else }}: value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + ${{ elseif eq(parameters.ep, 'directml')}}: + value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + ${{ else }}: + value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' + + - name: dml_dir + value: 'Microsoft.AI.DirectML.1.14.1' + - name: dml_zip + value: 'Microsoft.AI.DirectML.1.14.1.zip' + - name: dml_url + value: "https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1" + steps: - task: UsePythonVersion@0 inputs: @@ -97,6 +113,7 @@ jobs: - template: steps/capi-win-step.yml parameters: target: 'python' + ep: ${{ parameters.ep }} - ${{ if eq(parameters.publish_to_ado_feed, true)}}: - template: steps/py-ado-feed-releasing-step.yml diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 03f76feb1..2ff4d5c74 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -26,10 +26,10 @@ steps: echo "arch=$(arch)" echo "ep=$(ep)" displayName: 'Print Parameters' - - template: utils/download-ort.yml parameters: archiveType: 'tgz' + - bash: | set -e -x az login --identity --username 63b63039-6328-442f-954b-5a64d124e5b4 diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index 3681bffd4..8afafdb41 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -2,6 +2,9 @@ parameters: - name: target type: string default: 'onnxruntime-genai' +- name: ep + type: string + default: 'cpu' steps: - bash: | echo "##[error]Error: ep and arch are not set" @@ -28,10 +31,21 @@ steps: echo "cuda_version=$(cuda_version)" echo "target=${{ parameters.target }}" displayName: 'Print Parameters' - - template: utils/download-ort.yml parameters: archiveType: 'zip' + ep: ${{ parameters.ep }} + +- ${{ if eq(parameters.ep, 'directml') }}: + - powershell: | + Invoke-WebRequest -Uri $(dml_url) -OutFile $(dml_zip) + Expand-Archive $(dml_zip) -DestinationPath $(dml_dir) + Remove-Item -Path $(dml_zip) + Get-ChildItem -Recurse $(dml_dir) + mv $(dml_dir)\bin\x64-win\DirectML.dll ort\lib + mv $(dml_dir)\include\DirectML.h ort\include + workingDirectory: '$(Build.Repository.LocalPath)' + continueOnError: true - powershell: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v$(cuda_version)" 'cuda_sdk' diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index b5acf6a5a..af502e0df 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -16,12 +16,12 @@ steps: DisplayName: 'ESRP - Sign C# dlls' Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | - $VERSION = '0.1.0-rc4' + $VERSION = '0.2.0-rc4' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` -Prop version=$VERSION ` -Prop genai_nuget_ext=$(genai_nuget_ext) ` -Prop configuration=$(buildConfig) ` - -Prop buildPath=$(buildDir) + -Prop buildPath=$(buildDir) ` -Prop ortHome=$(ortHome) nuget.exe 
pack Microsoft.ML.OnnxRuntimeGenAI.Managed.nuspec ` -Prop version=$VERSION ` diff --git a/.pipelines/stages/jobs/steps/utils/capi-archive.yml b/.pipelines/stages/jobs/steps/utils/capi-archive.yml index 1395b31f7..6034d255c 100644 --- a/.pipelines/stages/jobs/steps/utils/capi-archive.yml +++ b/.pipelines/stages/jobs/steps/utils/capi-archive.yml @@ -39,6 +39,7 @@ steps: SourceFolder: '$(Build.Repository.LocalPath)/src' Contents: | ort_genai_c.h + ort_genai.h TargetFolder: '$(Build.ArtifactStagingDirectory)/$(artifactName)/include' - task: CopyFiles@2 diff --git a/.pipelines/stages/jobs/steps/utils/download-ort.yml b/.pipelines/stages/jobs/steps/utils/download-ort.yml index 78ffadd7c..5346bade8 100644 --- a/.pipelines/stages/jobs/steps/utils/download-ort.yml +++ b/.pipelines/stages/jobs/steps/utils/download-ort.yml @@ -1,6 +1,9 @@ parameters: - name: archiveType type: string +- name: ep + type: string + default: cpu steps: - bash: | echo "##[error]Error: ort_version and ort_filename are not set" @@ -8,29 +11,54 @@ steps: displayName: 'Check if variables ort_version and ort_filename are set' condition: or( eq (variables['ort_version'], ''), eq (variables['ort_filename'], '')) -- task: DownloadGitHubRelease@0 - inputs: - connection: 'GitHub - Release' - userRepository: 'microsoft/onnxruntime' - defaultVersionType: 'specificTag' - version: 'v$(ort_version)' - itemPattern: '$(ort_filename).${{ parameters.archiveType }}' - downloadPath: '$(Build.Repository.LocalPath)' - displayName: Download $(ort_filename) - -- task: ExtractFiles@1 - inputs: - archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).${{ parameters.archiveType }}' - destinationFolder: '$(Build.Repository.LocalPath)' - cleanDestinationFolder: false - overwriteExistingFiles: true - displayName: Unzip OnnxRuntime - -- task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/$(ort_filename)' - TargetFolder: '$(Build.Repository.LocalPath)/ort' - displayName: Copy OnnxRuntime to ort +#Special case for DML +- ${{ if ne(parameters.ep, 'directml') }}: + - task: DownloadGitHubRelease@0 + inputs: + connection: 'GitHub - Release' + userRepository: 'microsoft/onnxruntime' + defaultVersionType: 'specificTag' + version: 'v$(ort_version)' + itemPattern: '$(ort_filename).${{ parameters.archiveType }}' + downloadPath: '$(Build.Repository.LocalPath)' + displayName: Download $(ort_filename) + - task: ExtractFiles@1 + inputs: + archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).${{ parameters.archiveType }}' + destinationFolder: '$(Build.Repository.LocalPath)' + cleanDestinationFolder: false + overwriteExistingFiles: true + displayName: Unzip OnnxRuntime + - task: CopyFiles@2 + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/$(ort_filename)' + TargetFolder: '$(Build.Repository.LocalPath)/ort' + displayName: Copy OnnxRuntime to ort +- ${{ else }}: + - task: DownloadPackage@1 + inputs: + packageType: 'nuget' + feed: '2692857e-05ef-43b4-ba9c-ccf1c22c437c/7982ae20-ed19-4a35-a362-a96ac99897b7' + definition: 'Microsoft.ML.OnnxRuntime.DirectML' # Can also be package name + version: '$(ort_version)' + extract: false + downloadPath: '$(Build.Repository.LocalPath)' + displayName: Download $(ort_filename) + - task: ExtractFiles@1 + inputs: + archiveFilePatterns: '$(Build.Repository.LocalPath)/*.nupkg' + destinationFolder: '$(Build.Repository.LocalPath)/ort' + cleanDestinationFolder: false + overwriteExistingFiles: true + displayName: Unzip OnnxRuntime + - task: CopyFiles@2 + inputs: + SourceFolder: 
'$(Build.Repository.LocalPath)/ort/runtimes/win-x64/native' + TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' + - task: CopyFiles@2 + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/ort/build/native/include' + TargetFolder: '$(Build.Repository.LocalPath)/ort/include' - task: DeleteFiles@1 inputs: diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index f962337ac..93bd7ac84 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -3,12 +3,16 @@ parameters: type: boolean - name: enable_win_cuda type: boolean +- name: enable_win_dml + type: boolean - name: enable_linux_cpu type: boolean - name: enable_linux_cuda type: boolean - name: ort_version type: string +- name: ort_dml_version + type: string - name: cuda_version type: string default: '' @@ -26,6 +30,16 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + + - ${{ if eq(parameters.enable_win_dml, true) }}: + - template: jobs/nuget-packaging-job.yml + parameters: + arch: 'x64' + ep: 'directml' + ort_version: ${{ parameters.ort_dml_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -35,6 +49,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -43,6 +58,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -51,4 +67,5 @@ stages: ep: 'cuda' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} \ No newline at end of file + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index e23581f56..efbd28d9e 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -3,12 +3,16 @@ parameters: type: boolean - name: enable_win_cuda type: boolean +- name: enable_win_dml + type: boolean - name: enable_linux_cpu type: boolean - name: enable_linux_cuda type: boolean - name: ort_version type: string +- name: ort_dml_version + type: string - name: cuda_version type: string default: '' @@ -26,6 +30,16 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + + - ${{ if eq(parameters.enable_win_dml, true) }}: + - template: jobs/py-packaging-job.yml + parameters: + arch: 'x64' + ep: 'directml' + ort_version: ${{ parameters.ort_dml_version }} + os: 'win' + publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: @@ -35,6 +49,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'win' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/py-packaging-job.yml @@ -44,6 +59,7 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} + 
- ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: @@ -53,7 +69,4 @@ stages: ort_version: ${{ parameters.ort_version }} os: 'linux' publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - - - - + \ No newline at end of file diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 3a4c9b43b..2d4b62b1f 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -112,7 +112,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); diff --git a/cmake/presets/CMakeWinConfigPresets.json b/cmake/presets/CMakeWinConfigPresets.json index 57b74ae0d..3c885fdef 100644 --- a/cmake/presets/CMakeWinConfigPresets.json +++ b/cmake/presets/CMakeWinConfigPresets.json @@ -271,10 +271,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml release asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_debug_asan", @@ -283,10 +280,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml debug asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_relwithdebinfo_asan", @@ -295,10 +289,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml relwithdebinfo asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_minsizerel_asan", @@ -307,10 +298,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml minsizerel asan", - "binaryDir": "${sourceDir}/build/directml_asan", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml_asan" }, { "name": "windows_x64_directml_release", @@ -319,10 +307,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml release", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_x64_directml_debug", @@ -331,10 +316,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml debug", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_x64_directml_relwithdebinfo", @@ -343,10 +325,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml relwithdebinfo", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_x64_directml_minsizerel", @@ -355,10 +334,7 @@ "windows_directml_default" ], "displayName": "windows x64 directml minsizerel", - "binaryDir": "${sourceDir}/build/directml", - "cacheVariables": { - "USE_DML": "ON" - } + "binaryDir": "${sourceDir}/build/directml" }, { "name": "windows_arm64_cpu_relwithdebinfo", diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 4e8a5660c..2e3b93aa3 100644 --- a/examples/python/phi-3-tutorial.md +++ 
b/examples/python/phi-3-tutorial.md @@ -2,62 +2,139 @@ ## Steps 1. [Download Phi-3 Mini](#download-the-model) -2. [Install the generate() API](#install-the-generate()-api-package) -3. [Run Phi-3 Mini](#run-the-model) +2. [Build ONNX Runtime shared libraries](#build-onnx-runtime-from-source) +3. [Build generate() API](#build-the-generate-api-from-source) +4. [Run Phi-3 Mini](#run-the-model) ## Download the model Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face. +There are ONNX models for CPU (used for mobile too), as well as DirectML and CUDA. -For the short context model. + +## Install the generate() API package + +Right now, both `onnxruntime` and `onnxruntime-genai` need to be built from source. Once packages are published, this tutorial will be updated. + +The instructions for how to build both packages from source are documented in the [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html) guide. They are repeated here for your convenience. + +### Pre-requisites + +#### CMake + +This is included on Windows if you have Visual Studio installed. If you are running on Linux or Mac, you can install it using `conda`. ```bash -git clone https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx +conda install cmake ``` -For the long context model +### Build ONNX Runtime from source + +#### Clone the repo ```bash -git clone https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx +git clone https://github.com/microsoft/onnxruntime.git +cd onnxruntime ``` -These model repositories have models that run with DirectML, CPU and CUDA. +#### Build ONNX Runtime for DirectML on Windows -## Install the generate() API package +```bash +build.bat --build_shared_lib --skip_tests --parallel --use_dml --config Release +``` -### DirectML +#### Build ONNX Runtime for CPU on Windows +```bash +build.bat --build_shared_lib --skip_tests --parallel --config Release ``` -pip install numpy -pip install --pre onnxruntime-genai-directml + +#### Build ONNX Runtime for CUDA on Windows + +```bash +build.bat --build_shared_lib --skip_tests --parallel --use_cuda --config Release ``` -### CPU +#### Build ONNX Runtine on Linux +```bash +./build.sh --build_shared_lib --skip_tests --parallel [--use_cuda] --config Release ``` -pip install numpy -pip install --pre onnxruntime-genai + +You may need to provide extra command line options for building with CUDA on Linux. An example full command is as follows. + +```bash +./build.sh --parallel --build_shared_lib --use_cuda --cuda_version 11.8 --cuda_home /usr/local/cuda-11.8 --cudnn_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="80" --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda-11.8/bin/nvcc ``` -### CUDA +Replace the values given above for different versions and locations of CUDA. 
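If you are unsure which values to substitute, the CUDA toolkit already installed on the machine can be queried first. A minimal sketch, assuming an NVIDIA driver and CUDA toolkit are present (the paths shown are illustrative, not prescriptive):

```bash
# Report the installed toolkit version (e.g. 11.8) to pass as --cuda_version
nvcc --version

# Typical toolkit install locations to pass as --cuda_home on Linux
ls -d /usr/local/cuda*

# Driver and GPU details, useful when choosing CMAKE_CUDA_ARCHITECTURES
nvidia-smi
```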
+#### Build ONNX Runtime on Mac + +```bash +./build.sh --build_shared_lib --skip_tests --parallel --config Release ``` -pip install numpy -pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ + +### Build the generate() API from source + +#### Clone the repo + +```bash +git clone https://github.com/microsoft/onnxruntime-genai +cd onnxruntime-genai +mkdir -p ort/include +mkdir -p ort/lib +``` + +#### Build the generate() API on Windows + + +If building for DirectML + +```bash +copy ..\onnxruntime\include\onnxruntime\core\providers\dml\dml_provider_factory.h ort\include +``` + +```bash +copy ..\onnxruntime\include\onnxruntime\core\session\onnxruntime_c_api.h ort\include +copy ..\onnxruntime\build\Windows\Release\Release\*.dll ort\lib +copy ..\onnxruntime\build\Windows\Release\Release\onnxruntime.lib ort\lib +python build.py [--use_dml | --use_cuda] +cd build\wheel +pip install *.whl +``` + + +#### Build the generate() API on Linux + +```bash +cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include +cp ../onnxruntime/build/Linux/Release/libonnxruntime*.so* ort/lib +python build.py [--use_cuda] +cd build/wheel +pip install *.whl +``` + +#### Build the generate() API on Mac + +```bash +cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include +cp ../onnxruntime/build/MacOS/Release/libonnxruntime*.dylib* ort/lib +python build.py +cd build/wheel +pip install *.whl ``` ## Run the model -Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [this script](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. -This example is using the long context model running with DirectML on Windows. - ```bash -curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py -python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 +pip install numpy +python model-qa.py -m models/phi3-mini-4k-instruct-cpu-int4-rtn-block-32 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. 
For example: diff --git a/src/csharp/GeneratorParams.cs b/src/csharp/GeneratorParams.cs index 5aee3be3e..a48e5c4a5 100644 --- a/src/csharp/GeneratorParams.cs +++ b/src/csharp/GeneratorParams.cs @@ -30,6 +30,11 @@ public void SetSearchOption(string searchOption, bool value) Result.VerifySuccess(NativeMethods.OgaGeneratorParamsSetSearchBool(_generatorParamsHandle, StringUtils.ToUtf8(searchOption), value)); } + public void TryGraphCaptureWithMaxBatchSize(int maxBatchSize) + { + Result.VerifySuccess(NativeMethods.OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(_generatorParamsHandle, maxBatchSize)); + } + public void SetInputIDs(ReadOnlySpan inputIDs, ulong sequenceLength, ulong batchSize) { unsafe diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index da9d20e29..f2906f3df 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -53,6 +53,10 @@ internal class NativeLib byte[] /* const char* */ searchOption, bool value); + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern IntPtr /* OgaResult* */ OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(IntPtr /* OgaGeneratorParams* */ generatorParams, + int /* int32_t */ maxBatchSize); + // This function is used to set the input IDs for the generator. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern unsafe IntPtr /* OgaResult* */ OgaGeneratorParamsSetInputIDs(IntPtr /* OgaGeneratorParams* */ generatorParams, @@ -67,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); @@ -125,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. 
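// Note (assumption, not stated in the patch itself): the const qualifier on the model pointer is
// dropped in the signatures below, which appears to follow from the C API change later in this
// patch where OgaGenerate/OgaCreateGenerator call GetMaxBatchSizeFromGeneratorParams on the model
// before generating, so the model can no longer be treated as read-only.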
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); diff --git a/src/ort_genai.h b/src/ort_genai.h index e2c560637..fb863dae2 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) const { + std::unique_ptr Generate(const OgaGeneratorParams& params) { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -193,11 +193,15 @@ struct OgaGeneratorParams : OgaAbstract { OgaCheckResult(OgaGeneratorParamsSetInputSequences(this, &sequences)); } + void TryGraphCaptureWithMaxBatchSize(int max_batch_size) { + OgaCheckResult(OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(this, max_batch_size)); + } + static void operator delete(void* p) { OgaDestroyGeneratorParams(reinterpret_cast(p)); } }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 1245c60d8..13cae5235 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -105,6 +105,14 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OGA_CATCH } +OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { + OGA_TRY + auto* params = reinterpret_cast(generator_params); + params->max_batch_size = max_batch_size; + return nullptr; + OGA_CATCH +} + OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputIDs(OgaGeneratorParams* oga_params, const int32_t* input_ids, size_t input_ids_count, size_t sequence_length, size_t batch_size) { OGA_TRY auto& params = *reinterpret_cast(oga_params); @@ -135,17 +143,23 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { +OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); + auto* model_p = reinterpret_cast(model); + auto* params = reinterpret_cast(generator_params); + model_p->GetMaxBatchSizeFromGeneratorParams(*params); + auto result = Generators::Generate(*model_p, *params); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); + auto* model_p = reinterpret_cast(model); + auto* params = reinterpret_cast(generator_params); + model_p->GetMaxBatchSizeFromGeneratorParams(*params); + *out = 
reinterpret_cast(CreateGenerator(*model_p, *params).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 5b6b9034f..0939d2c36 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -117,7 +117,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. * \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -135,6 +135,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyGeneratorParams(OgaGeneratorParams* gener OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchNumber(OgaGeneratorParams* generator_params, const char* name, double value); OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* generator_params, const char* name, bool value); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size); /* * \brief Sets the input ids for the generator params. The input ids are used to seed the generation. @@ -166,7 +167,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. From bf7d2f10e1d15c9848ff29824a5a9c142418dc7e Mon Sep 17 00:00:00 2001 From: Parinita Rahi <101819959+parinitarahi@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:38:06 -0700 Subject: [PATCH 04/15] Update README.md (#350) Added changes based on Marco's feedback --- README.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 68b1cfbd5..f6cab5472 100644 --- a/README.md +++ b/README.md @@ -51,24 +51,33 @@ See full documentation at [https://onnxruntime.ai/docs/genai]. ## Installation -### DirectML +If you don't know which hardware capabilities is available on your device. 
+* Windows GPU (use DirectML): [Verify if you have Windows GPU](https://www.microsoft.com/en-us/windows/learning-center/how-to-check-gpu) + +* CUDA GPU: [Verify if you have CUDA GPU](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#verify-you-have-a-cuda-capable-gpu) + +* CPU and Mobile: For Windows, Mac, Android and other devices use the CPU and Mobile option below + +### Windows GPU (DirectML) ```bash pip install [--pre] numpy onnxruntime-genai-directml ``` -### CPU +### CUDA GPU ```bash -pip install [--pre] numpy onnxruntime-genai +pip install numpy onnxruntime-genai-cuda --pre --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ ``` -### CUDA + +### CPU and Mobile ```bash -pip install numpy onnxruntime-genai-cuda --pre --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ +pip install [--pre] numpy onnxruntime-genai ``` + ## Sample code for phi-2 in Python [Install](https://onnxruntime.ai/docs/genai/howto/install) the onnxruntime-genai Python package. From 2fa3964db05e095a2d1eea1aca4885c67c870f09 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Mon, 29 Apr 2024 14:57:38 -0700 Subject: [PATCH 05/15] Change phi-3-tutorial.md back (#354) --- examples/python/phi-3-tutorial.md | 121 ++++++------------------------ 1 file changed, 22 insertions(+), 99 deletions(-) diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 2e3b93aa3..4e8a5660c 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -2,139 +2,62 @@ ## Steps 1. [Download Phi-3 Mini](#download-the-model) -2. [Build ONNX Runtime shared libraries](#build-onnx-runtime-from-source) -3. [Build generate() API](#build-the-generate-api-from-source) -4. [Run Phi-3 Mini](#run-the-model) +2. [Install the generate() API](#install-the-generate()-api-package) +3. [Run Phi-3 Mini](#run-the-model) ## Download the model Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face. -There are ONNX models for CPU (used for mobile too), as well as DirectML and CUDA. - -## Install the generate() API package - -Right now, both `onnxruntime` and `onnxruntime-genai` need to be built from source. Once packages are published, this tutorial will be updated. - -The instructions for how to build both packages from source are documented in the [build from source](https://onnxruntime.ai/docs/genai/howto/build-from-source.html) guide. They are repeated here for your convenience. - -### Pre-requisites - -#### CMake - -This is included on Windows if you have Visual Studio installed. If you are running on Linux or Mac, you can install it using `conda`. - -```bash -conda install cmake -``` - -### Build ONNX Runtime from source - -#### Clone the repo - -```bash -git clone https://github.com/microsoft/onnxruntime.git -cd onnxruntime -``` - -#### Build ONNX Runtime for DirectML on Windows - -```bash -build.bat --build_shared_lib --skip_tests --parallel --use_dml --config Release -``` - -#### Build ONNX Runtime for CPU on Windows - -```bash -build.bat --build_shared_lib --skip_tests --parallel --config Release -``` - -#### Build ONNX Runtime for CUDA on Windows +For the short context model. 
```bash -build.bat --build_shared_lib --skip_tests --parallel --use_cuda --config Release +git clone https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx ``` -#### Build ONNX Runtine on Linux +For the long context model ```bash -./build.sh --build_shared_lib --skip_tests --parallel [--use_cuda] --config Release +git clone https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx ``` -You may need to provide extra command line options for building with CUDA on Linux. An example full command is as follows. +These model repositories have models that run with DirectML, CPU and CUDA. -```bash -./build.sh --parallel --build_shared_lib --use_cuda --cuda_version 11.8 --cuda_home /usr/local/cuda-11.8 --cudnn_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES="80" --cmake_extra_defines CMAKE_CUDA_COMPILER=/usr/local/cuda-11.8/bin/nvcc -``` - -Replace the values given above for different versions and locations of CUDA. +## Install the generate() API package -#### Build ONNX Runtime on Mac +### DirectML -```bash -./build.sh --build_shared_lib --skip_tests --parallel --config Release ``` - -### Build the generate() API from source - -#### Clone the repo - -```bash -git clone https://github.com/microsoft/onnxruntime-genai -cd onnxruntime-genai -mkdir -p ort/include -mkdir -p ort/lib +pip install numpy +pip install --pre onnxruntime-genai-directml ``` -#### Build the generate() API on Windows - - -If building for DirectML +### CPU -```bash -copy ..\onnxruntime\include\onnxruntime\core\providers\dml\dml_provider_factory.h ort\include ``` - -```bash -copy ..\onnxruntime\include\onnxruntime\core\session\onnxruntime_c_api.h ort\include -copy ..\onnxruntime\build\Windows\Release\Release\*.dll ort\lib -copy ..\onnxruntime\build\Windows\Release\Release\onnxruntime.lib ort\lib -python build.py [--use_dml | --use_cuda] -cd build\wheel -pip install *.whl +pip install numpy +pip install --pre onnxruntime-genai ``` +### CUDA -#### Build the generate() API on Linux - -```bash -cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include -cp ../onnxruntime/build/Linux/Release/libonnxruntime*.so* ort/lib -python build.py [--use_cuda] -cd build/wheel -pip install *.whl ``` - -#### Build the generate() API on Mac - -```bash -cp ../onnxruntime/include/onnxruntime/core/session/onnxruntime_c_api.h ort/include -cp ../onnxruntime/build/MacOS/Release/libonnxruntime*.dylib* ort/lib -python build.py -cd build/wheel -pip install *.whl +pip install numpy +pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ ``` ## Run the model -Run the model with [this script](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. +This example is using the long context model running with DirectML on Windows. 
+ ```bash -pip install numpy -python model-qa.py -m models/phi3-mini-4k-instruct-cpu-int4-rtn-block-32 +curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py +python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: From 2f619f904ca446427f914bef4540545645f66ed5 Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Mon, 29 Apr 2024 15:19:40 -0700 Subject: [PATCH 06/15] Remove NO_TOKENIZER build option (#353) We always build with tokenizer now, and the option was broken anyways as it wasn't being updated. --- CMakeLists.txt | 15 +++++---------- cmake/options.cmake | 1 - src/models/model.cpp | 22 ---------------------- src/models/model.h | 17 ----------------- test/CMakeLists.txt | 9 +++------ test/model_tests.cpp | 8 -------- 6 files changed, 8 insertions(+), 64 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index acf8f22f6..7aa8021a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,16 +179,11 @@ if(USE_DML) add_dependencies(onnxruntime-genai-static RESTORE_PACKAGES) endif() -if(NO_TOKENIZEROOT) - add_compile_definitions(NO_TOKENIZER=1) - message("----------------Tokenizer Disabled------------------") -else() - add_subdirectory("${CMAKE_SOURCE_DIR}/src/tokenizer") - target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT}) - target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT}) - target_link_libraries(onnxruntime-genai PRIVATE tokenizer) - target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer) -endif() +add_subdirectory("${CMAKE_SOURCE_DIR}/src/tokenizer") +target_include_directories(onnxruntime-genai PRIVATE ${TOKENIZER_ROOT}) +target_include_directories(onnxruntime-genai-static PUBLIC ${TOKENIZER_ROOT}) +target_link_libraries(onnxruntime-genai PRIVATE tokenizer) +target_link_libraries(onnxruntime-genai-static PUBLIC tokenizer) if(ENABLE_TESTS) add_subdirectory("${CMAKE_SOURCE_DIR}/test") diff --git a/cmake/options.cmake b/cmake/options.cmake index ac40a6d1d..688633fda 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -2,7 +2,6 @@ include(CMakeDependentOption) option(USE_CUDA "Build with CUDA support" ON) option(USE_DML "Build with DML support" OFF) -option(NO_TOKENIZER "Don't include the Tokenizer" OFF) option(ENABLE_PYTHON "Build the Python API." 
ON) option(ENABLE_TESTS "Enable tests" ON) option(TEST_PHI2 "Enable tests for Phi2" OFF) diff --git a/src/models/model.cpp b/src/models/model.cpp index 4e7aa7343..439ab5c6a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -94,26 +94,6 @@ std::vector PadInputs(std::span> sequences, in return result; } -#ifdef NO_TOKENIZER -const std::string& TokenizerStream::Decode(int32_t token) { - throw std::runtime_error("Tokenizer not enabled"); -} - -std::unique_ptr Tokenizer::CreateStream() const { - return std::make_unique(); -} - -Tokenizer::Tokenizer(Config& config) { -} - -std::vector Tokenizer::Encode(const char* text) const { - throw std::runtime_error("Tokenizer not enabled"); -} - -std::string Tokenizer::Decode(std::span tokens) const { - throw std::runtime_error("Tokenizer not enabled"); -} -#else void CheckResult(tfmError_t error) { if (error != kTfmOK) throw std::runtime_error(TfmGetLastErrorMessage()); @@ -179,8 +159,6 @@ std::vector Tokenizer::DecodeBatch(std::span sequenc return strings; } -#endif - #if USE_CUDA // Since Python/Others can and will hold onto a generator object past the model object's lifetime we need to ensure // the allocator used is not destroyed until last. This keeps the allocator around until exit, after all other memory diff --git a/src/models/model.h b/src/models/model.h index b569373f8..fe3b9d832 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -1,8 +1,5 @@ #pragma once -#ifndef NO_TOKENIZER #include "tfmtok_c.h" -#endif - #include "captured_graph_pool.h" #if USE_DML @@ -36,19 +33,6 @@ struct State { void ClearIO(); // Clear all inputs/outputs }; -#ifdef NO_TOKENIZER -struct TokenizerStream { - const std::string& Decode(int32_t token); -}; - -struct Tokenizer { - Tokenizer(Config& config); - - std::vector Encode(const char* text) const; - std::string Decode(std::span tokens) const; -}; -#else - template struct TfmPtr { ~TfmPtr() { TfmDispose(&p_); } @@ -94,7 +78,6 @@ struct Tokenizer : std::enable_shared_from_this { private: int32_t pad_token_id_; }; -#endif struct SessionInfo { SessionInfo(OrtSession& session); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index daf8c40b3..80bb58fcd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -43,12 +43,9 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER) target_sources(unit_tests PRIVATE ${cuda_test_srcs}) endif() -if(NO_TOKENIZER) - add_compile_definitions(NO_TOKENIZER=1) -else() - target_include_directories(unit_tests PRIVATE ${TOKENIZER_ROOT}) - target_link_libraries(unit_tests PRIVATE tokenizer) -endif() +target_include_directories(unit_tests PRIVATE ${TOKENIZER_ROOT}) +target_link_libraries(unit_tests PRIVATE tokenizer) + set(TEST_MODEL_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/test_models/") set(TEST_MODEL_DES_DIR "$/test_models/") add_custom_command(TARGET unit_tests POST_BUILD diff --git a/test/model_tests.cpp b/test/model_tests.cpp index 66ceaee83..73c6464e0 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -203,7 +203,6 @@ TEST(ModelTests, BeamSearchGptCuda) { TEST(ModelTests, TestApiCuda) { #if TEST_PHI2 -#ifndef NO_TOKENIZER auto prompt = R"( def print_prime(n): @@ -234,15 +233,11 @@ Print all primes between 1 and n auto result = generator->GetSequence(0); std::cout << tokenizer->Decode(result.GetCPU()) << "\r\n"; -#else - std::cout << "Test skipped - not built with onnxruntime extensions\r\n"; -#endif #endif } TEST(ModelTests, TestHighLevelApiCuda) { #if TEST_PHI2 -#ifndef NO_TOKENIZER auto prompt = R"( def print_prime(n): ''' @@ -266,9 +261,6 @@ Print all primes 
between 1 and n auto result = Generators::Generate(*model, *params); std::cout << tokenizer->Decode(result[0]) << "\r\n"; -#else - std::cout << "Test skipped - not built with onnxruntime extensions\r\n"; -#endif #endif } From 639d176d4a39cb32dc1087644f0a9c2a18bcf5a5 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 29 Apr 2024 18:26:33 -0700 Subject: [PATCH 07/15] Use the most performant adapter for DML (#333) --- CMakeLists.txt | 4 ++-- src/dml/dml_helpers.cpp | 26 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7aa8021a1..7eaa9f83c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,8 +150,8 @@ if(USE_DML) target_include_directories(onnxruntime-genai-static PUBLIC $) target_include_directories(onnxruntime-genai-static PUBLIC $/directx) target_include_directories(onnxruntime-genai-static PUBLIC $) - target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib) - target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib) + target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib dxcore.lib dxguid.lib) + target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib dxcore.lib dxguid.lib) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps ABSOLUTE) set(DXC_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.Direct3D.DXC.1.7.2308.12) diff --git a/src/dml/dml_helpers.cpp b/src/dml/dml_helpers.cpp index e7a0c2f08..2dcd0267f 100644 --- a/src/dml/dml_helpers.cpp +++ b/src/dml/dml_helpers.cpp @@ -9,6 +9,29 @@ namespace DmlHelpers { +static ComPtr CreatePerformantAdapter() { + ComPtr adapter_factory; + THROW_IF_FAILED(DXCoreCreateAdapterFactory(adapter_factory.GetAddressOf())); + + ComPtr adapter_list; + THROW_IF_FAILED(adapter_factory->CreateAdapterList( + 1, + &DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE, + adapter_list.GetAddressOf())); + + // We prefer the hightest performance adapter + std::array adapter_list_preferences = {DXCoreAdapterPreference::HighPerformance}; + + THROW_IF_FAILED(adapter_list->Sort( + static_cast(adapter_list_preferences.size()), + adapter_list_preferences.data())); + + ComPtr performant_adapter; + THROW_IF_FAILED(adapter_list->GetAdapter(0, performant_adapter.GetAddressOf())); + + return performant_adapter; +} + DmlObjects CreateDmlObjects() { D3D12_COMMAND_QUEUE_DESC command_queue_description = { D3D12_COMMAND_LIST_TYPE_COMPUTE, @@ -19,7 +42,8 @@ DmlObjects CreateDmlObjects() { DmlObjects dml_objects; - THROW_IF_FAILED(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); + auto adapter = CreatePerformantAdapter(); + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandQueue(&command_queue_description, IID_PPV_ARGS(&dml_objects.command_queue))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&dml_objects.command_allocator))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, dml_objects.command_allocator.Get(), nullptr, IID_PPV_ARGS(&dml_objects.command_list))); From 514493be37eebbfcdf387c3d83ac71650303d18a Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Mon, 29 Apr 2024 21:05:23 -0700 Subject: [PATCH 08/15] Improve tutorial (#355) --- examples/python/phi-3-tutorial.md | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/examples/python/phi-3-tutorial.md 
b/examples/python/phi-3-tutorial.md index 4e8a5660c..6d2f5f728 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -26,25 +26,40 @@ These model repositories have models that run with DirectML, CPU and CUDA. ## Install the generate() API package +**Unsure about which installation instructions to follow?** Here's a bit more guidance: + +Are you on Windows machine with GPU? +* I don't know → Review [this guide](https://www.microsoft.com/en-us/windows/learning-center/how-to-check-gpu) to see whether you have a GPU in your Windows machine. +* Yes → Follow the instructions for [DirectML](#directml). +* No → Do you have an NVIDIA GPU? + * I don't know → Review [this guide](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#verify-you-have-a-cuda-capable-gpu) to see whether you have a CUDA-capable GPU. + * Yes → Follow the instructions for [NVIDIA CUDA GPU](#nvidia-cuda-gpu). + * No → Follow the instructions for [CPU](#cpu). + +*Note: Only one package is required based on your hardware.* + ### DirectML + ``` pip install numpy pip install --pre onnxruntime-genai-directml ``` -### CPU +### NVIDIA CUDA GPU + ``` pip install numpy -pip install --pre onnxruntime-genai +pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ ``` -### CUDA +### CPU + ``` pip install numpy -pip install --pre onnxruntime-genai-cuda --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/ +pip install --pre onnxruntime-genai ``` ## Run the model @@ -55,6 +70,9 @@ The script accepts a model folder and takes the generation parameters from the c This example is using the long context model running with DirectML on Windows. +The `-m` argument is the path to the model you downloaded from HuggingFace above. +The `-l` argument is the length of output you would like to generate with the model. + ```bash curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 @@ -66,4 +84,4 @@ Once the script has loaded the model, it will ask you for input in a loop, strea Input: <|user|>Tell me a joke about creative writing<|end|><|assistant|> Output: Why don't writers ever get lost? Because they always follow the plot! -``` +``` \ No newline at end of file From a028d7879a2adc92f6a8bf0babe01d1c7398c15f Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:31:46 -0700 Subject: [PATCH 09/15] Simulate the chat template (#352) --- examples/python/model-qa.py | 12 ++++- examples/python/phi-3-tutorial.md | 6 +-- examples/python/phi3-qa.py | 87 +++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 examples/python/phi3-qa.py diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py index 6f323ccc4..57ec9f6db 100644 --- a/examples/python/model-qa.py +++ b/examples/python/model-qa.py @@ -15,6 +15,9 @@ def main(args): if args.verbose: print("Tokenizer created") if args.verbose: print() search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1: + print("Error, chat template must have exactly one pair of curly braces, e.g. 
'<|user|>\n{input} <|end|>\n<|assistant|>'") + exit(1) # Keep asking for input prompts in a loop while True: @@ -25,7 +28,12 @@ def main(args): if args.timings: started_timestamp = time.time() - input_tokens = tokenizer.encode(args.system_prompt + text) + # If there is a chat template, use it + prompt = text + if args.chat_template: + prompt = f'{args.chat_template.format(input=text)}' + + input_tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) params.try_use_cuda_graph_with_max_batch_size(1) @@ -76,7 +84,7 @@ def main(args): parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-s', '--system_prompt', type=str, default='', help='Prepend a system prompt to the user input prompt. Defaults to empty') parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}') args = parser.parse_args() main(args) \ No newline at end of file diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 6d2f5f728..5442886a3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -64,7 +64,7 @@ pip install --pre onnxruntime-genai ## Run the model -Run the model with [model-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/model-qa.py). +Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/blob/main/examples/python/phi3-qa.py). The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line. @@ -74,8 +74,8 @@ The `-m` argument is the path to the model you downloaded from HuggingFace above The `-l` argument is the length of output you would like to generate with the model. ```bash -curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/model-qa.py -o model-qa.py -python model-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 +curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py +python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128 -l 2048 ``` Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. 
For example: diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py new file mode 100644 index 000000000..9e9392895 --- /dev/null +++ b/examples/python/phi3-qa.py @@ -0,0 +1,87 @@ +import onnxruntime_genai as og +import argparse +import time + +def main(args): + if args.verbose: print("Loading model...") + if args.timings: + started_timestamp = 0 + first_token_timestamp = 0 + + model = og.Model(f'{args.model}') + if args.verbose: print("Model loaded") + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + if args.verbose: print("Tokenizer created") + if args.verbose: print() + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + + # Keep asking for input prompts in a loop + while True: + text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + + if args.timings: started_timestamp = time.time() + + # If there is a chat template, use it + prompt = f'{chat_template.format(input=text)}' + + print(f"Prompt: {prompt}") + + input_tokens = tokenizer.encode(prompt) + + params = og.GeneratorParams(model) + params.try_use_cuda_graph_with_max_batch_size(1) + params.set_search_options(**search_options) + params.input_ids = input_tokens + generator = og.Generator(model, params) + if args.verbose: print("Generator created") + + if args.verbose: print("Running generation loop ...") + if args.timings: + first = True + new_tokens = [] + + print() + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.compute_logits() + generator.generate_next_token() + if args.timings: + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end='', flush=True) + if args.timings: new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + print() + print() + + if args.timings: + prompt_time = first_token_timestamp - started_timestamp + run_time = time.time() - first_token_timestamp + print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)') + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') + parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. 
Defaults to false') + parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') + parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') + parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + args = parser.parse_args() + main(args) \ No newline at end of file From cb4e3aaf982abcd22adfc01a9d4663f76f645561 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 09:35:37 -0700 Subject: [PATCH 10/15] Update phi-3-tutorial.md (#361) --- examples/python/phi-3-tutorial.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md index 5442886a3..5ee9331e3 100644 --- a/examples/python/phi-3-tutorial.md +++ b/examples/python/phi-3-tutorial.md @@ -81,7 +81,7 @@ python phi3-qa.py -m Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-bl Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example: ```bash -Input: <|user|>Tell me a joke about creative writing<|end|><|assistant|> +Input: Tell me a joke about creative writing Output: Why don't writers ever get lost? Because they always follow the plot! -``` \ No newline at end of file +``` From afd2edc892c58a41f6b69df0200e96ea6785d9e6 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 30 Apr 2024 14:48:45 -0700 Subject: [PATCH 11/15] Make OgaModel* const again (#356) --- benchmark/c/main.cpp | 2 +- src/csharp/NativeMethods.cs | 4 ++-- src/generators.cpp | 20 +++++++++++++++++++- src/generators.h | 6 ++++++ src/models/captured_graph_pool.cpp | 2 +- src/models/decoder_only.cpp | 4 ++-- src/models/model.cpp | 22 ---------------------- src/models/model.h | 5 ----- src/ort_genai.h | 4 ++-- src/ort_genai_c.cpp | 16 +++++----------- src/ort_genai_c.h | 4 ++-- src/python/python.cpp | 5 ++--- 12 files changed, 42 insertions(+), 52 deletions(-) diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 2d4b62b1f..3a4c9b43b 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -112,7 +112,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index f2906f3df..a56e7dd7e 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -71,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); 
@@ -129,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); diff --git a/src/generators.cpp b/src/generators.cpp index 0c664f341..bc00f8d3e 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -61,7 +61,25 @@ GeneratorParams::GeneratorParams(const Model& model) eos_token_id{model.config_->model.eos_token_id}, vocab_size{model.config_->model.vocab_size}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_} { + cuda_stream{model.cuda_stream_}, + is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { +} + +void GeneratorParams::TryGraphCapture(int max_bs) { + if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + // no-op + return; + } + + if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (max_bs == 0) { + throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); + } + use_cuda_graph = true; + max_batch_size = max_bs; + } else { + throw std::runtime_error("CUDA graph is not supported on this device"); + } } std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { diff --git a/src/generators.h b/src/generators.h index c10868570..c6a510739 100644 --- a/src/generators.h +++ b/src/generators.h @@ -61,6 +61,7 @@ struct GeneratorParams : std::enable_shared_from_this { int batch_size{1}; int max_batch_size{0}; + bool use_cuda_graph{}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } @@ -97,6 +98,11 @@ struct GeneratorParams : std::enable_shared_from_this { std::vector input_ids_owner; // Backing memory of input_ids in some cases std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + + void TryGraphCapture(int max_bs); + + private: + bool is_cuda_graph_enabled_{}; }; struct Generator { diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 140f2a8cd..96cc029b8 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -24,7 +24,7 @@ static std::tuple MakeKey(int max_batch_size, int max_length, int } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { return nullptr; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 83d1f03d3..53f4f6697 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -26,7 +26,7 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, RoamingArra RoamingArray DecoderOnly_State::Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices) { if (first_run_) { - if (model_.use_cuda_graph_) { + if 
(params_->use_cuda_graph) { model_.run_options_->AddConfigEntry("gpu_graph_id", "-1"); } first_run_ = false; @@ -37,7 +37,7 @@ RoamingArray DecoderOnly_State::Run(int current_length, RoamingArrayuse_cuda_graph) { int new_batch_size = static_cast(input_ids_.GetShape()[0]); if (new_batch_size != current_batch_size_) { current_batch_size_ = new_batch_size; diff --git a/src/models/model.cpp b/src/models/model.cpp index 439ab5c6a..6f0cc294a 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -515,26 +515,4 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, return expanded; } -void Model::GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params) { - bool is_cuda_graph_enabled = device_type_ == DeviceType::DML || IsCudaGraphEnabled(config_->model.decoder.session_options); - max_batch_size_ = params.max_batch_size; - - if (DeviceType::CUDA == device_type_) { - if (is_cuda_graph_enabled) { - if (max_batch_size_ == 0) { - throw std::runtime_error("CUDA graph is enabled, but max_batch_size is not set."); - } - use_cuda_graph_ = true; - } - } else if (DeviceType::DML == device_type_) { - if (max_batch_size_ == 0) { - throw std::runtime_error("max_batch_size needs to be set when using DirectML."); - } - - use_cuda_graph_ = true; - } else if (is_cuda_graph_enabled) { - throw std::runtime_error("CUDA graph is not supported on this device"); - } -} - } // namespace Generators diff --git a/src/models/model.h b/src/models/model.h index fe3b9d832..5b9ec12d9 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -102,8 +102,6 @@ struct Model : std::enable_shared_from_this { std::unique_ptr ExpandInputs(std::unique_ptr& input, int num_beams) const; - void GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params); - CapturedGraphPool* GetCapturedGraphPool() const { return captured_graph_pool_.get(); } std::unique_ptr config_; @@ -119,9 +117,6 @@ struct Model : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime - bool use_cuda_graph_{}; - int max_batch_size_{}; - #if USE_DML DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } diff --git a/src/ort_genai.h b/src/ort_genai.h index fb863dae2..b8e55bf19 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -201,7 +201,7 @@ struct OgaGeneratorParams : OgaAbstract { }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index 13cae5235..d5ab67040 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -108,7 +108,7 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { OGA_TRY auto* params = reinterpret_cast(generator_params); - params->max_batch_size = 
max_batch_size; + params->TryGraphCapture(max_batch_size); return nullptr; OGA_CATCH } @@ -143,23 +143,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { +OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - auto result = Generators::Generate(*model_p, *params); + auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - *out = reinterpret_cast(CreateGenerator(*model_p, *params).release()); + *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index 0939d2c36..3e44c29e4 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -117,7 +117,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. * \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -167,7 +167,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. 
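Seen from the Python binding (updated in the python.cpp hunk below), the refactor behaves roughly as in this sketch; the model folder and prompt are placeholders rather than anything this patch ships:

```python
# Minimal sketch, assuming a CUDA-capable build; "path/to/cuda/model" is a placeholder.
import onnxruntime_genai as og

model = og.Model("path/to/cuda/model")
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
# The graph-capture decision now lives on GeneratorParams: TryGraphCapture is a
# no-op on CPU or when cuda graphs are disabled, throws if they are enabled but
# the batch size is 0, and throws on devices other than CUDA and DML.
params.try_use_cuda_graph_with_max_batch_size(1)
params.set_search_options(max_length=256)
params.input_ids = tokenizer.encode("def is_prime(num):")

# The generator no longer needs to mutate the model to pick up the max batch size.
generator = og.Generator(model, params)
```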
diff --git a/src/python/python.cpp b/src/python/python.cpp index cd974d916..1d8a4e567 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -104,7 +104,7 @@ struct PyGeneratorParams { } void TryUseCudaGraphWithMaxBatchSize(pybind11::int_ max_batch_size) { - params_->max_batch_size = max_batch_size.cast(); + params_->TryGraphCapture(max_batch_size.cast()); } pybind11::array_t py_input_ids_; @@ -115,7 +115,6 @@ struct PyGeneratorParams { struct PyGenerator { PyGenerator(Model& model, PyGeneratorParams& params) { params.Prepare(); - model.GetMaxBatchSizeFromGeneratorParams(params); generator_ = CreateGenerator(model, params); } @@ -229,7 +228,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) - .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); model.GetMaxBatchSizeFromGeneratorParams(params); return Generate(model, params); }) + .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") From eea971091978a7e8aa6d90a550d327e09c29d445 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 30 Apr 2024 15:22:55 -0700 Subject: [PATCH 12/15] update readme (#363) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6cab5472..113a0a5b7 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ prompt = '''def print_prime(n): tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) -params.set_search_options({"max_length":200}) +params.set_search_options(max_length=200) # Add the following line to enable cuda graph by passing the maximum batch size. 
# params.try_use_cuda_graph_with_max_batch_size(16) params.input_ids = tokens From 7dd45f2a6b3930543e65a0d2685d00eaa30fb522 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 30 Apr 2024 18:34:01 -0400 Subject: [PATCH 13/15] Use ort-nightly build for genai gha ci (#256) --- .github/workflows/linux-cpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/linux-gpu-x64-build.yml | 52 +++++++++++++++---- .github/workflows/mac-cpu-arm64-build.yml | 26 +++++----- .github/workflows/win-cpu-arm64-build.yml | 10 ++-- .github/workflows/win-cpu-x64-build.yml | 44 ++++++++++------ .github/workflows/win-cuda-x64-build.yml | 44 ++++++++++------ cmake/presets/CMakeMacOSConfigPresets.json | 2 +- nuget.config | 17 +++--- onnxruntime-genai.sln | 36 +++++++++++++ ...icrosoft.ML.OnnxRuntimeGenAI.Tests.csproj} | 3 +- 10 files changed, 208 insertions(+), 78 deletions(-) create mode 100644 onnxruntime-genai.sln rename test/csharp/{Microsoft.OnnxRuntimeGenAI.Tests.csproj => Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj} (92%) diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2e1c03aab..744fa567a 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -4,10 +4,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-1.17.3" - ort_zip: "onnxruntime-linux-x64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" jobs: linux_cpu_x64: runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ] @@ -16,19 +16,49 @@ jobs: uses: actions/checkout@v4 with: submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - - name: Download OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download 
OnnxRuntime Nightly run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ${{ env.NUGET_EXE }} install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + + - name: list files + shell: bash + run: | + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version - name: Build with CMake and GCC run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c1e51251b..123ff5f75 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -6,9 +6,11 @@ concurrency: cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-gpu-1.17.3" - ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" + jobs: linux-cuda-x64-build: @@ -29,19 +31,49 @@ jobs: clean: true path: manylinux submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - name: Download OnnxRuntime run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly + run: | + mono /usr/local/bin/nuget.exe install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + - name: list files + shell: bash run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename 
OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extra OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version + - name: Get Docker Image run: | @@ -78,7 +110,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release --parallel $( nproc )" + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Get HuggingFace Token run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9cb9cdc46..aba92d017 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -4,9 +4,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-osx-arm64-1.17.3" - ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: runs-on: macos-latest @@ -16,22 +15,21 @@ jobs: with: submodules: true - - name: Install ninja + - name: Get the Latest OnnxRuntime Nightly Version run: | - brew install ninja - - - name: Download OnnxRuntime + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV + - name: Download OnnxRuntime Nightly run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x - - name: Unzip OnnxRuntime + - name: Extra OnnxRuntime library and header files run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Rename OnnxRuntime to ort - run: | - mv ${{ env.ort_dir }} ort - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 916af3009..ce3bfcf4b 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -53,6 +53,11 @@ jobs: run: | cmake --build --preset windows_arm64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -62,10 +67,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify 
Build Artifacts if: always() diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ca0bb6b5b..cf5614dee 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -11,10 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-win-x64-1.17.3" - ort_zip: "$(ort_dir).zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)" binaryDir: 'build/cpu' + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: windows-cpu-x64-build: @@ -33,19 +32,32 @@ jobs: with: vs-version: '17.5' - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and nuget run: | - $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip" - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip + $ORT_NIGHTLY_VERSION = $(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly + run: | + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true - - name: Rename OnnxRuntime to ort + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -60,6 +72,11 @@ jobs: run: | cmake --build --preset windows_x64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the python wheel and test dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -76,10 +93,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index a9f602ef8..f0cebbae8 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -8,14 +8,12 @@ concurrency: env: AZCOPY_AUTO_LOGIN_TYPE: MSI AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - ort_dir: "onnxruntime-win-x64-gpu-1.17.3" - ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip" cuda_dir: "${{ github.workspace }}\\cuda_sdk" 
cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 binaryDir: 'build/cuda' - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Windows&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime.Gpu.Windows" jobs: windows-cuda-x64-build: @@ -35,17 +33,32 @@ jobs: run: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ env.cuda_version }}" ${{ env.cuda_dir}} - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and curl run: | - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh + run: | + $ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip - - name: Rename OnnxRuntime to ort + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -ExcludeVersion -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true + + - name: Extra OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Configure CMake run: | @@ -59,6 +72,11 @@ jobs: run: | echo "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -75,10 +93,6 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index cd0c0a0b9..1ea6d85c8 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -6,7 +6,7 @@ "configurePresets": [ { "name": "macos_default", - "generator": "Ninja", + "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", diff --git a/nuget.config b/nuget.config index 3e0389a52..63a200340 100644 --- a/nuget.config +++ b/nuget.config @@ -3,11 +3,14 @@ - - - - - - - + + + + + + + + + + \ No newline at end of file diff --git a/onnxruntime-genai.sln b/onnxruntime-genai.sln new file mode 100644 index 000000000..5e59cc82e --- /dev/null +++ b/onnxruntime-genai.sln @@ -0,0 +1,36 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 
+MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{2253BDCC-33C9-431E-889A-56E3E75D10BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI", "src\csharp\Microsoft.ML.OnnxRuntimeGenAI.csproj", "{CA0EC087-3AF5-44D5-93F0-489420EBA014}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{505E2406-98C2-46DD-973A-3CEB95CF3626}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI.Tests", "test\csharp\Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj", "{24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.Build.0 = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.Build.0 = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.ActiveCfg = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014} = {2253BDCC-33C9-431E-889A-56E3E75D10BA} + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73} = {505E2406-98C2-46DD-973A-3CEB95CF3626} + EndGlobalSection +EndGlobal diff --git a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj similarity index 92% rename from test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj rename to test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index e4ec8e6d8..978deb04e 100644 --- a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -12,7 +12,8 @@ default True Debug;RelWithDebInfo;Release - + https://api.nuget.org/v3/index.json + $(RestoreAdditionalProjectSources);$(RestoreSources) Microsoft.ML.OnnxRuntimeGenAI.Tests Microsoft.ML.OnnxRuntimeGenAI.Tests From f94280f493c2f628726b7ea924592531fdb1bda1 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Tue, 30 Apr 2024 17:51:11 -0700 Subject: [PATCH 14/15] Ensure CIs are running on merge (#334) --- .github/workflows/linux-cpu-arm64-build.yml | 9 ++++++++- .github/workflows/linux-cpu-x64-build.yml | 8 +++++++- .github/workflows/linux-gpu-x64-build.yml | 8 +++++++- .github/workflows/mac-cpu-arm64-build.yml | 8 +++++++- .github/workflows/win-cuda-x64-build.yml | 8 +++++++- .github/workflows/win-directml-x64-build.yml | 8 +++++++- 6 files changed, 43 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linux-cpu-arm64-build.yml b/.github/workflows/linux-cpu-arm64-build.yml index 3b55c3fe5..622b73eea 100644 --- a/.github/workflows/linux-cpu-arm64-build.yml +++ b/.github/workflows/linux-cpu-arm64-build.yml @@ -1,5 +1,12 @@ name: "Linux CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: + 
concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 744fa567a..290695c9c 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CPU x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 123ff5f75..f6cdf0f37 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -1,5 +1,11 @@ name: "Linux CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index aba92d017..f2f90e427 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -1,5 +1,11 @@ name: "MacOS CPU ARM64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index f0cebbae8..ccc2f71fe 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows CUDA x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/.github/workflows/win-directml-x64-build.yml b/.github/workflows/win-directml-x64-build.yml index 152b9ab1d..f7dcd89d0 100644 --- a/.github/workflows/win-directml-x64-build.yml +++ b/.github/workflows/win-directml-x64-build.yml @@ -1,5 +1,11 @@ name: "Windows DirectML x64 Build" -on: [ workflow_dispatch, pull_request ] +on: + workflow_dispatch: + push: + branches: + - main + - rel-* + pull_request: concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} From b3ff5cec93015ef8b76ce7778be1df0acb3d893c Mon Sep 17 00:00:00 2001 From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Date: Tue, 30 Apr 2024 21:38:06 -0700 Subject: [PATCH 15/15] Add 'add_extra_input' to handle models like QLora (#370) Add a new python api 'add_extra_input' that will take numpy tensors and turn them into OrtValue inputs internally. This allows models with extra custom inputs (like QLora) to be specified in python. C API to follow soon. 
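A minimal sketch of the intended Python usage, assuming a model that actually declares an extra input with the (hypothetical) name used below; the model folder is a placeholder:

```python
import numpy as np
import onnxruntime_genai as og

model = og.Model("path/to/model")  # placeholder folder
tokenizer = og.Tokenizer(model)

params = og.GeneratorParams(model)
params.set_search_options(max_length=200)
params.input_ids = tokenizer.encode("def is_prime(num):")

# The numpy array is turned into an OrtValue internally and matched to a model
# input by name at run time; "lora_scale" is a hypothetical QLora-style input.
params.add_extra_input("lora_scale", np.array([1.0], dtype=np.float32))

output_tokens = model.generate(params)
print(tokenizer.decode(output_tokens[0]))
```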
--- src/generators.h | 8 ++++++++ src/models/model.cpp | 5 +++++ src/models/model.h | 2 ++ src/models/static_buffer.cpp | 19 ++----------------- src/models/static_buffer.h | 1 - src/python/python.cpp | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 18 deletions(-) diff --git a/src/generators.h b/src/generators.h index c6a510739..e6ad6f0e1 100644 --- a/src/generators.h +++ b/src/generators.h @@ -99,6 +99,14 @@ struct GeneratorParams : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + struct Input { + std::string name; + std::unique_ptr value; + }; + + // A list of extra model inputs that will be matched at runtime based on name + std::vector extra_inputs; + void TryGraphCapture(int max_bs); private: diff --git a/src/models/model.cpp b/src/models/model.cpp index 6f0cc294a..35a9b4ad4 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -35,6 +35,11 @@ static std::wstring CurrentModulePath() { namespace Generators { State::State(const GeneratorParams& params) : params_{params.shared_from_this()} { + // Add extra user inputs + for (auto& input : params.extra_inputs) { + input_names_.push_back(input.name.c_str()); + inputs_.push_back(input.value.get()); + } } void State::Run(OrtSession& session, OrtRunOptions& run_options) { diff --git a/src/models/model.h b/src/models/model.h index 5b9ec12d9..165e7c345 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -16,6 +16,8 @@ struct Tokenizer; void ConvertFp16ToFp32(OrtAllocator& allocator, OrtValue& in, std::unique_ptr& p_out, DeviceType device_type, cudaStream_t stream); +size_t GetOrtTypeSize(ONNXTensorElementDataType type); + struct State { State(const GeneratorParams& params); virtual ~State() = default; diff --git a/src/models/static_buffer.cpp b/src/models/static_buffer.cpp index 9bc5f50ea..eab776e65 100644 --- a/src/models/static_buffer.cpp +++ b/src/models/static_buffer.cpp @@ -1,4 +1,5 @@ #include "../generators.h" +#include "model.h" #include "static_buffer.h" namespace Generators { @@ -8,7 +9,7 @@ StaticBuffer::StaticBuffer(Ort::Allocator* allocator, size_t max_beam_batch_size std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape, ONNXTensorElementDataType type) { - size_t new_bytes = GetElementSize(type) * GetNumElements(shape); + size_t new_bytes = GetOrtTypeSize(type) * GetNumElements(shape); if (buffer_ == nullptr) { // Assuming the first dimension is the batch size bytes_ = new_bytes * (max_beam_batch_size_ / shape[0]); @@ -21,22 +22,6 @@ std::unique_ptr StaticBuffer::CreateTensorOnStaticBuffer(std::span shape) { size_t num_elements = 1; for (auto dim : shape) { diff --git a/src/models/static_buffer.h b/src/models/static_buffer.h index ce9e14686..8c133fdae 100644 --- a/src/models/static_buffer.h +++ b/src/models/static_buffer.h @@ -18,7 +18,6 @@ struct StaticBuffer { ONNXTensorElementDataType type); private: - size_t GetElementSize(ONNXTensorElementDataType type); size_t GetNumElements(std::span shape); Ort::Allocator* allocator_{nullptr}; diff --git a/src/python/python.cpp b/src/python/python.cpp index 1d8a4e567..8bd25a9d3 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -22,6 +22,34 @@ pybind11::array_t ToPython(std::span v) { return pybind11::array_t(v.size(), v.data()); } +ONNXTensorElementDataType ToTensorType(const pybind11::dtype& type) { + switch (type.num()) { + case pybind11::detail::npy_api::NPY_INT32_: + return Ort::TypeToTensorType::type; + case 
pybind11::detail::npy_api::NPY_UINT32_: + return Ort::TypeToTensorType::type; + case 23 /*NPY_FLOAT16*/: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_FLOAT_: + return Ort::TypeToTensorType::type; + case pybind11::detail::npy_api::NPY_DOUBLE_: + return Ort::TypeToTensorType::type; + default: + throw std::runtime_error("Unsupported numpy type"); + } +} + +std::unique_ptr ToTensor(pybind11::array& v) { + auto type = ToTensorType(v.dtype()); + + std::vector shape(v.ndim()); + for (pybind11::ssize_t i = 0; i < v.ndim(); i++) + shape[i] = v.shape()[i]; + + auto p_memory_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); + return OrtValue::CreateTensor(*p_memory_info, v.mutable_data(), v.nbytes(), shape, type); +} + namespace Generators { // A roaming array is one that can be in CPU or GPU memory, and will copy the memory as needed to be used from anywhere @@ -85,6 +113,11 @@ struct PyGeneratorParams { } } + void AddExtraInput(const std::string& name, pybind11::array& value) { + params_->extra_inputs.push_back({name, ToTensor(value)}); + refs_.emplace_back(value); + } + void SetSearchOptions(const pybind11::kwargs& dict) { for (auto& entry : dict) { auto name = entry.first.cast(); @@ -110,6 +143,8 @@ struct PyGeneratorParams { pybind11::array_t py_input_ids_; pybind11::array_t py_whisper_input_features_; pybind11::array_t py_whisper_decoder_input_ids_; + + std::vector refs_; // References to data we want to ensure doesn't get garbage collected }; struct PyGenerator { @@ -198,6 +233,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def_readwrite("input_ids", &PyGeneratorParams::py_input_ids_) .def_readwrite("whisper_input_features", &PyGeneratorParams::py_whisper_input_features_) .def_readwrite("whisper_decoder_input_ids", &PyGeneratorParams::py_whisper_decoder_input_ids_) + .def("add_extra_input", &PyGeneratorParams::AddExtraInput) .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize);
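A short sketch of what the `ToTensorType` mapping above implies for callers; the model folder and input names are again placeholders, and the dtype list reflects only the cases handled in this change:

```python
import numpy as np
import onnxruntime_genai as og

model = og.Model("path/to/model")  # placeholder
params = og.GeneratorParams(model)

# Accepted by ToTensorType: int32, uint32, float16, float32, float64.
params.add_extra_input("scale_fp16", np.zeros((1,), dtype=np.float16))
params.add_extra_input("step_i32", np.zeros((1,), dtype=np.int32))

# Other dtypes fall into the default case of the switch and surface in Python
# as RuntimeError("Unsupported numpy type").
try:
    params.add_extra_input("bad_i64", np.array([1, 2, 3], dtype=np.int64))
except RuntimeError as err:
    print(err)
```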