diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2e1c03aab..8063cbeda 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -4,10 +4,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-1.17.3" - ort_zip: "onnxruntime-linux-x64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" jobs: linux_cpu_x64: runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ] @@ -16,19 +16,49 @@ jobs: uses: actions/checkout@v4 with: submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - - name: Download OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ${{ env.NUGET_EXE }} install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + + - name: list files + shell: bash + run: | + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extract OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version - name: Build with CMake and GCC run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c1e51251b..6ea48d5d7 100644 --- 
a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -6,9 +6,11 @@ concurrency: cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-gpu-1.17.3" - ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" + jobs: linux-cuda-x64-build: @@ -29,19 +31,49 @@ jobs: clean: true path: manylinux submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - name: Download OnnxRuntime run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly + run: | + mono /usr/local/bin/nuget.exe install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + - name: list files + shell: bash run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extract OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version + - name: Get Docker Image run: | @@ -78,7 +110,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release --parallel $( nproc )" + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Get HuggingFace Token run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9cb9cdc46..a35063a98 100644 --- 
a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -4,9 +4,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-osx-arm64-1.17.3" - ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: runs-on: macos-latest @@ -16,22 +15,21 @@ jobs: with: submodules: true - - name: Install ninja + - name: Get the Latest OnnxRuntime Nightly Version run: | - brew install ninja - - - name: Download OnnxRuntime + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV + - name: Download OnnxRuntime Nightly run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x - - name: Unzip OnnxRuntime + - name: Extract OnnxRuntime library and header files run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Rename OnnxRuntime to ort - run: | - mv ${{ env.ort_dir }} ort - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 916af3009..ce3bfcf4b 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -53,6 +53,11 @@ jobs: run: | cmake --build --preset windows_arm64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -62,10 +67,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ca0bb6b5b..855025fb2 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -11,10 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-win-x64-1.17.3" - ort_zip: "$(ort_dir).zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)" binaryDir: 'build/cpu' + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: windows-cpu-x64-build: @@ -33,19 +32,32 @@ jobs: with: vs-version: '17.5' - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + 
with: + dotnet-version: '6.0.x' + + - name : Install jq and nuget run: | - $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip" - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip + $ORT_NIGHTLY_VERSION = $(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly + run: | + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true - - name: Rename OnnxRuntime to ort + - name: Extract OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -60,6 +72,11 @@ jobs: run: | cmake --build --preset windows_x64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the python wheel and test dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -76,10 +93,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index a9f602ef8..0f27a7bfc 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -8,14 +8,12 @@ concurrency: env: AZCOPY_AUTO_LOGIN_TYPE: MSI AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - ort_dir: "onnxruntime-win-x64-gpu-1.17.3" - ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip" cuda_dir: "${{ github.workspace }}\\cuda_sdk" cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 binaryDir: 'build/cuda' - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Windows&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime.Gpu.Windows" jobs: windows-cuda-x64-build: @@ -35,17 +33,32 @@ jobs: run: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ env.cuda_version }}" ${{ env.cuda_dir}} - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and curl run: | - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh + run: | + $ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API 
}}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip - - name: Rename OnnxRuntime to ort + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -ExcludeVersion -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true + + - name: Extract OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Configure CMake run: | @@ -59,6 +72,11 @@ jobs: run: | echo "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -75,10 +93,6 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() diff --git a/.gitignore b/.gitignore index 1ff9a0f9c..a4ad8b1a4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,8 @@ cache_dir example-models *.onnx -*.onnx.data +*.onnx.date +**/.DS_Store __pycache__ benchmark/python/*.csv examples/python/genai_models diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index f02e05080..8e09c3eb1 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -27,12 +27,17 @@ parameters: - name: ort_version displayName: 'OnnxRuntime version' type: string - default: '1.17.3' + default: '1.18.0-dev-20240426-1256-b842effa29' + +- name: ort_cuda_version + displayName: 'OnnxRuntime GPU version' + type: string + default: '1.18.0-dev-20240426-0614-b842effa29' - name: ort_dml_version - displayName: 'OnnxRuntime DirectML version' + displayName: 'OnnxRuntime DML version' type: string - default: '1.18.0-dev-20240423-0527-c07b8d545d' + default: '1.18.0-dev-20240426-0116-b842effa29' - name: cuda_version displayName: 'CUDA version' @@ -42,16 +47,6 @@ parameters: - '12.2' default: '11.8' -- name: use_build_in_ort - displayName: 'Whether to use the built-in OnnxRuntime package.' 
- type: boolean - default: false - -- name: publish_to_ado_feed - displayName: 'Publish to Azure DevOps Feed' - type: boolean - default: false - resources: repositories: - repository: manylinux @@ -70,7 +65,6 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} \ No newline at end of file diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index 8759a6cca..b603fcde3 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -7,7 +7,7 @@ parameters: - name: enable_win_cuda displayName: 'Whether Windows CUDA package is built.' type: boolean - default : true + default: true - name: enable_win_dml displayName: 'Whether Windows DirectML package is built.' @@ -27,12 +27,17 @@ parameters: - name: ort_version displayName: 'OnnxRuntime version' type: string - default: '1.17.3' + default: '1.18.0-dev-20240426-1256-b842effa29' + +- name: ort_cuda_version + displayName: 'OnnxRuntime GPU version' + type: string + default: '1.18.0-dev-20240426-0614-b842effa29' - name: ort_dml_version - displayName: 'OnnxRuntime DirectML version' + displayName: 'OnnxRuntime DML version' type: string - default: '1.18.0-dev-20240423-0527-c07b8d545d' + default: '1.18.0-dev-20240426-0116-b842effa29' - name: cuda_version displayName: 'CUDA version' @@ -42,16 +47,6 @@ parameters: - '11.8' - '12.2' -- name: use_build_in_ort - displayName: 'Whether to use the built-in OnnxRuntime package.' - type: boolean - default: false - -- name: publish_to_ado_feed - displayName: 'Whether to publish the packages to ADO feed.' 
- type: boolean - default: false - resources: repositories: - repository: manylinux @@ -70,7 +65,7 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index 0728720c4..5f2a4c8e9 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -13,10 +13,6 @@ parameters: values: - 'linux' - 'win' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean jobs: - job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging @@ -46,18 +42,21 @@ jobs: value: ${{ parameters.ort_version }} - name: GDN_CODESIGN_TARGETDIRECTORY value: '$(Build.ArtifactStagingDirectory)/nuget' + - name: os + value: ${{ parameters.os }} - name: ort_filename ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' ${{ elseif eq(parameters.ep, 'cuda') }}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{if eq(parameters.os, 'win') }}: + value: 'Microsoft.ML.OnnxRuntime.Gpu.Windows' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' ${{ elseif eq(parameters.ep, 'directml')}}: - value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.DirectML' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' + - name: genai_nuget_ext ${{ if eq(parameters.ep, 'cpu') }}: value: '' @@ -86,7 +85,8 @@ jobs: - template: steps/capi-linux-step.yml parameters: target: 'onnxruntime-genai' - use_build_in_ort: ${{ parameters.use_build_in_ort }} + arch: ${{ parameters.arch }} + ep: ${{ parameters.ep }} # TODO: Add a step to build the linux nuget package @@ -94,12 +94,9 @@ jobs: - template: steps/capi-win-step.yml parameters: target: 'onnxruntime-genai' + arch: ${{ parameters.arch }} ep: ${{ parameters.ep }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} - - template: steps/nuget-win-step.yml - - ${{ if eq(parameters.publish_to_ado_feed, true)}}: - - template: steps/nuget-ado-feed-releasing-step.yml - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml index 32cc3e0e3..f5a183289 100644 --- a/.pipelines/stages/jobs/py-packaging-job.yml +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -13,10 +13,7 @@ parameters: values: - 'linux' - 'win' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean + jobs: - job: python_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging @@ -69,18 +66,21 @@ jobs: value: ${{ parameters.ep }} - name: ort_version value: ${{ parameters.ort_version }} + - name: os + value: ${{ parameters.os }} + - name: 
ort_filename ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' ${{ elseif eq(parameters.ep, 'cuda') }}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{if eq(parameters.os, 'win') }}: + value: 'Microsoft.ML.OnnxRuntime.Gpu.Windows' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' ${{ elseif eq(parameters.ep, 'directml')}}: - value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.DirectML' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' - name: dml_dir value: 'Microsoft.AI.DirectML.1.14.1' @@ -109,20 +109,16 @@ jobs: - template: steps/capi-linux-step.yml parameters: target: 'python' - use_build_in_ort: ${{ parameters.use_build_in_ort }} - + arch: ${{ parameters.arch }} + ep: ${{ parameters.ep }} # Windows job needs to set the python version and install the required packages - ${{ if eq(parameters.os, 'win') }}: - template: steps/capi-win-step.yml parameters: target: 'python' + arch: ${{ parameters.arch }} ep: ${{ parameters.ep }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} - - - - ${{ if eq(parameters.publish_to_ado_feed, true)}}: - - template: steps/py-ado-feed-releasing-step.yml - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 1e518f08b..16638017d 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -1,8 +1,12 @@ parameters: - name: target type: string -- name: use_build_in_ort - type: boolean +- name: ep + type: string + default: 'cpu' +- name: arch + type: string + default: 'x64' steps: - checkout: self @@ -29,23 +33,9 @@ steps: echo "ep=$(ep)" displayName: 'Print Parameters' -- ${{ if eq(parameters.use_build_in_ort, false) }}: - - template: utils/download-ort.yml - parameters: - archiveType: 'tgz' -- ${{ else }}: - - bash: | - set -e -x - azcopy copy --recursive "https://lotusscus.blob.core.windows.net/tmp/ort/$(os)/$(ep)" '$(Build.Repository.LocalPath)/tmp_ort' - displayName: 'Download ONNXRuntime' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/lib' - TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/include' - TargetFolder: '$(Build.Repository.LocalPath)/ort/include' +- template: utils/download-ort.yml + parameters: + archiveType: 'tgz' - bash: | set -e -x @@ -57,7 +47,7 @@ steps: --container-registry onnxruntimebuildcache \ --manylinux-src manylinux \ --multiple_repos \ - --repository onnxruntime$(ep)build$(arch) + --repository ortgenai$(ep)build$(arch) displayName: 'Get Docker Image' workingDirectory: '$(Build.Repository.LocalPath)' @@ -67,7 +57,7 @@ steps: docker run \ --rm \ --volume $(Build.Repository.LocalPath):/ort_genai_src \ - -w /ort_genai_src/ onnxruntime$(ep)build$(arch) \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ bash -c " \ /usr/bin/cmake --preset linux_gcc_$(ep)_release \ -DENABLE_TESTS=OFF && \ @@ -75,6 +65,7 @@ steps: --target 
onnxruntime-genai" displayName: 'Build GenAi' workingDirectory: '$(Build.Repository.LocalPath)' + - task: BinSkim@4 displayName: 'Run BinSkim' inputs: @@ -83,12 +74,22 @@ steps: - template: utils/capi-archive.yml parameters: archiveType: tar + - script: | + set -e -x + docker run \ + --rm \ + --volume $(Build.Repository.LocalPath):/ort_genai_src \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ + bash -c " \ + /usr/bin/cmake --build --preset linux_gcc_$(ep)_release --target package" + displayName: 'Package C/C++ API' + workingDirectory: '$(Build.Repository.LocalPath)' - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: ArtifactName: $(artifactName)-capi - PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + PathtoPublish: '$(Build.Repository.LocalPath)/build/$(ep)/package' - ${{ if eq(parameters.target, 'python') }}: - bash: | @@ -96,7 +97,7 @@ steps: docker run \ --rm \ --volume $(Build.Repository.LocalPath):/ort_genai_src \ - -w /ort_genai_src/ onnxruntime$(ep)build$(arch) \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ bash -c " \ /usr/bin/cmake --preset linux_gcc_$(ep)_release \ -DENABLE_TESTS=OFF \ @@ -118,7 +119,7 @@ steps: docker run \ --rm \ --volume $(Build.Repository.LocalPath):/ort_genai_src \ - -w /ort_genai_src/ onnxruntime$(ep)build$(arch) \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ bash -c " \ /usr/bin/cmake --build --preset linux_gcc_$(ep)_release \ -DENABLE_TESTS=OFF \ diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index 7026e058e..4d527cda1 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -2,11 +2,12 @@ parameters: - name: target type: string default: 'onnxruntime-genai' -- name: use_build_in_ort - type: boolean - name: ep type: string default: 'cpu' +- name: arch + type: string + default: 'x64' steps: - bash: | echo "##[error]Error: ep and arch are not set" @@ -33,23 +34,12 @@ steps: echo "cuda_version=$(cuda_version)" echo "target=${{ parameters.target }}" displayName: 'Print Parameters' -- ${{ if eq(parameters.use_build_in_ort, false) }}: - - template: utils/download-ort.yml - parameters: - archiveType: 'zip' - ep: ${{ parameters.ep }} -- ${{ else }}: - - bash: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/tmp/ort/$(os)/$(ep)" '$(Build.Repository.LocalPath)/tmp_ort' - displayName: 'Download ONNXRuntime' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/lib' - TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/include' - TargetFolder: '$(Build.Repository.LocalPath)/ort/include' + + +- template: utils/download-ort.yml + parameters: + archiveType: 'zip' + ep: ${{ parameters.ep }} - ${{ if eq(parameters.ep, 'directml') }}: - powershell: | @@ -98,15 +88,16 @@ steps: AnalyzeTargetGlob: '$(Build.Repository.LocalPath)\**\*genai.dll' continueOnError: true - - template: utils/capi-archive.yml - parameters: - archiveType: zip + - powershell: | + cmake --build --preset windows_$(arch)_$(ep)_release --target package + displayName: 'Package C/C++ API' + workingDirectory: '$(Build.Repository.LocalPath)' - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: ArtifactName: $(artifactName)-capi - PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + PathtoPublish: 
'$(Build.Repository.LocalPath)\build\$(ep)\package' - ${{ if eq(parameters.target, 'python') }}: - task: BinSkim@4 diff --git a/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml b/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml deleted file mode 100644 index 331a9ea7c..000000000 --- a/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml +++ /dev/null @@ -1,43 +0,0 @@ -steps: -- task: NuGetToolInstaller@1 - inputs: - versionSpec: 6.8.x - -- powershell: | - New-Item -Path $(Agent.TempDirectory) -Name "binfiles" -ItemType "directory" - $base_path_name = Join-Path -Path $(Agent.TempDirectory) -ChildPath "binfiles" - Get-ChildItem $(GDN_CODESIGN_TARGETDIRECTORY) -Filter *.nupkg | - Foreach-Object { - $dir_name = Join-Path -Path $base_path_name -ChildPath $_.Basename - $cmd = "7z.exe x $($_.FullName) -y -o$dir_name" - Write-Output $cmd - Invoke-Expression -Command $cmd - } - dir $(Agent.TempDirectory) - tree $(Agent.TempDirectory) - workingDirectory: '$(Agent.TempDirectory)' - -- task: CodeSign@1 - displayName: 'Run Codesign Validation' - -- task: PublishSecurityAnalysisLogs@3 - displayName: 'Publish Security Analysis Logs' - continueOnError: true - -- task: PostAnalysis@2 - inputs: - GdnBreakAllTools: true - GdnBreakPolicy: M365 - GdnBreakPolicyMinSev: Error - -- template: utils/get-nuget-package-version-as-variable.yml - parameters: - packageFolder: '$(GDN_CODESIGN_TARGETDIRECTORY)' -#This task must be run on a Windows machine -- task: NuGetCommand@2 - displayName: 'NuGet push to Azure DevOps Feed' - inputs: - command: push - packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' - publishVstsFeed: 'PublicPackages/onnxruntime-genai' - allowPackageConflicts: true \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index af502e0df..aae44d69a 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -16,7 +16,7 @@ steps: DisplayName: 'ESRP - Sign C# dlls' Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | - $VERSION = '0.2.0-rc4' + $VERSION = '0.2.0-rc5' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` -Prop version=$VERSION ` -Prop genai_nuget_ext=$(genai_nuget_ext) ` diff --git a/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml b/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml deleted file mode 100644 index 85c0a7e3d..000000000 --- a/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml +++ /dev/null @@ -1,10 +0,0 @@ -steps: -- task: TwineAuthenticate@1 - inputs: - artifactFeed: PublicPackages/onnxruntime-genai -- script: 'python -m twine upload -r onnxruntime-genai --config-file $(PYPIRC_PATH) --non-interactive *.whl' - workingDirectory: '$(Build.ArtifactStagingDirectory)/wheel' - displayName: 'Uploading wheels to PublicPackages/onnxruntime-genai' - retryCountOnTaskFailure: 3 - env: - SYSTEM_ACCESSTOKEN: $(System.AccessToken) \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/utils/capi-archive.yml b/.pipelines/stages/jobs/steps/utils/capi-archive.yml index 1395b31f7..3de2f2703 100644 --- a/.pipelines/stages/jobs/steps/utils/capi-archive.yml +++ b/.pipelines/stages/jobs/steps/utils/capi-archive.yml @@ -20,7 +20,7 @@ steps: inputs: SourceFolder: '$(Build.Repository.LocalPath)/$(buildDir)' Contents: | - onnxruntime-genai.so + libonnxruntime-genai.so TargetFolder: '$(Build.ArtifactStagingDirectory)\$(artifactName)\lib' - ${{ else }}: - task: CopyFiles@2 diff 
--git a/.pipelines/stages/jobs/steps/utils/download-ort.yml b/.pipelines/stages/jobs/steps/utils/download-ort.yml index 5346bade8..fd393c62f 100644 --- a/.pipelines/stages/jobs/steps/utils/download-ort.yml +++ b/.pipelines/stages/jobs/steps/utils/download-ort.yml @@ -5,60 +5,58 @@ parameters: type: string default: cpu steps: -- bash: | - echo "##[error]Error: ort_version and ort_filename are not set" - exit 1 - displayName: 'Check if variables ort_version and ort_filename are set' - condition: or( eq (variables['ort_version'], ''), eq (variables['ort_filename'], '')) -#Special case for DML -- ${{ if ne(parameters.ep, 'directml') }}: - - task: DownloadGitHubRelease@0 - inputs: - connection: 'GitHub - Release' - userRepository: 'microsoft/onnxruntime' - defaultVersionType: 'specificTag' - version: 'v$(ort_version)' - itemPattern: '$(ort_filename).${{ parameters.archiveType }}' - downloadPath: '$(Build.Repository.LocalPath)' - displayName: Download $(ort_filename) +- task: DownloadPackage@1 + inputs: + packageType: 'nuget' + feed: '2692857e-05ef-43b4-ba9c-ccf1c22c437c/7982ae20-ed19-4a35-a362-a96ac99897b7' + definition: '$(ort_filename)' # Can also be package name + version: '$(ort_version)' + extract: false + downloadPath: '$(Build.Repository.LocalPath)' + displayName: Download Onnxruntime file +- ${{ if eq(parameters.archiveType, 'zip') }}: - task: ExtractFiles@1 inputs: - archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).${{ parameters.archiveType }}' - destinationFolder: '$(Build.Repository.LocalPath)' + archiveFilePatterns: '$(Build.Repository.LocalPath)/*.nupkg' + destinationFolder: '$(Build.Repository.LocalPath)/ort' cleanDestinationFolder: false overwriteExistingFiles: true displayName: Unzip OnnxRuntime - task: CopyFiles@2 inputs: - SourceFolder: '$(Build.Repository.LocalPath)/$(ort_filename)' - TargetFolder: '$(Build.Repository.LocalPath)/ort' - displayName: Copy OnnxRuntime to ort + SourceFolder: '$(Build.Repository.LocalPath)/ort/runtimes/$(os)-$(arch)/native' + TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - ${{ else }}: - - task: DownloadPackage@1 - inputs: - packageType: 'nuget' - feed: '2692857e-05ef-43b4-ba9c-ccf1c22c437c/7982ae20-ed19-4a35-a362-a96ac99897b7' - definition: 'Microsoft.ML.OnnxRuntime.DirectML' # Can also be package name - version: '$(ort_version)' - extract: false - downloadPath: '$(Build.Repository.LocalPath)' - displayName: Download $(ort_filename) + - script: | + mv $(Build.Repository.LocalPath)/$(ort_filename).nupkg $(Build.Repository.LocalPath)/$(ort_filename).zip + displayName: Rename OnnxRuntime nuget package to zip - task: ExtractFiles@1 inputs: - archiveFilePatterns: '$(Build.Repository.LocalPath)/*.nupkg' + archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).zip' destinationFolder: '$(Build.Repository.LocalPath)/ort' cleanDestinationFolder: false overwriteExistingFiles: true displayName: Unzip OnnxRuntime - task: CopyFiles@2 inputs: - SourceFolder: '$(Build.Repository.LocalPath)/ort/runtimes/win-x64/native' + SourceFolder: '$(Build.Repository.LocalPath)/ort/runtimes/$(os)-$(arch)/native' TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/ort/build/native/include' - TargetFolder: '$(Build.Repository.LocalPath)/ort/include' + # TODO: Find out why do we need to to have libonnxruntime.so.ort_stable_version + - script: | + set -e -x + ort_stable_version=$(echo $(ort_version) | cut -d- -f1) + echo $ort_stable_version + cp 
libonnxruntime.so libonnxruntime.so.$ort_stable_version + displayName: Copy libonnxruntime.so to libonnxruntime.so. + workingDirectory: '$(Build.Repository.LocalPath)/ort/lib' + +- task: CopyFiles@2 + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/ort/' + Contents: '**/native/include/**' + TargetFolder: '$(Build.Repository.LocalPath)/ort/include' + flattenFolders: true - task: DeleteFiles@1 inputs: diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index 8bc0da10a..37709022e 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -11,15 +11,14 @@ parameters: type: boolean - name: ort_version type: string +- name: ort_cuda_version + type: string - name: ort_dml_version type: string - name: cuda_version type: string default: '' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean + stages: - stage: nuget_packaging @@ -31,8 +30,7 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_dml, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -40,18 +38,16 @@ stages: ep: 'directml' ort_version: ${{ parameters.ort_dml_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -59,15 +55,12 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index 39a0d8e77..d2e220d5a 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -11,15 +11,14 @@ parameters: type: boolean - name: ort_version type: string +- name: ort_cuda_version + type: string - name: ort_dml_version type: string - name: cuda_version type: string default: '' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean + stages: - stage: python_packaging @@ -31,8 +30,7 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_dml, true) }}: - template: jobs/py-packaging-job.yml parameters: @@ -40,18 +38,16 @@ stages: ep: 'directml' ort_version: ${{ 
parameters.ort_dml_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/py-packaging-job.yml @@ -60,15 +56,17 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} \ No newline at end of file + + + + + diff --git a/CMakeLists.txt b/CMakeLists.txt index acf8f22f6..63c438245 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ message("Building onnxruntime-genai for version ${VERSION_INFO}") # Checking if CUDA is supported include(CheckLanguage) add_compile_definitions(BUILDING_ORT_GENAI_C) + if(USE_CUDA) check_language(CUDA) if(CMAKE_CUDA_COMPILER) @@ -150,8 +151,8 @@ if(USE_DML) target_include_directories(onnxruntime-genai-static PUBLIC $) target_include_directories(onnxruntime-genai-static PUBLIC $/directx) target_include_directories(onnxruntime-genai-static PUBLIC $) - target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib) - target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib) + target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib dxcore.lib dxguid.lib dxgi.lib) + target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib dxcore.lib dxguid.lib dxgi.lib) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps ABSOLUTE) set(DXC_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.Direct3D.DXC.1.7.2308.12) @@ -245,5 +246,7 @@ foreach(DLL_FILE ${onnxruntime_libs}) ) endforeach() +include(cmake/package.cmake) + # Have visual studio put all files into one single folder vs the default split of header files into a separate folder source_group(TREE ${GENERATORS_ROOT} FILES ${generator_srcs}) diff --git a/VERSION_INFO b/VERSION_INFO index ff4a316ec..47035a2ca 100644 --- a/VERSION_INFO +++ b/VERSION_INFO @@ -1 +1 @@ -0.2.0rc4 \ No newline at end of file +0.2.0rc5 \ No newline at end of file diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 76d25458b..4e502de00 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -111,7 +111,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); @@ -231,6 +231,7 @@ void RunBenchmark(const benchmark::Options& opts) { } // namespace int main(int argc, char** argv) { + OgaHandle handle; try { const auto opts = 
benchmark::ParseOptionsFromCommandLine(argc, argv); RunBenchmark(opts); diff --git a/cmake/cxx_standard.cmake b/cmake/cxx_standard.cmake index 7e752d40b..52732c2f2 100644 --- a/cmake/cxx_standard.cmake +++ b/cmake/cxx_standard.cmake @@ -9,4 +9,8 @@ elseif (USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION else () message("Test is using C++20") set(CMAKE_CXX_STANDARD 20) -endif () \ No newline at end of file +endif () + +if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) + add_compile_definitions(USE_EXPERIMENTAL_FILESYSTEM) +endif() \ No newline at end of file diff --git a/cmake/package.cmake b/cmake/package.cmake new file mode 100644 index 000000000..9c9102689 --- /dev/null +++ b/cmake/package.cmake @@ -0,0 +1,29 @@ +set_target_properties( + onnxruntime-genai PROPERTIES PUBLIC_HEADER + "${PROJECT_SOURCE_DIR}/src/ort_genai_c.h;${PROJECT_SOURCE_DIR}/src/ort_genai.h" +) +install(TARGETS + onnxruntime-genai + LIBRARY + PUBLIC_HEADER +) +set(CPACK_PACKAGE_VENDOR "Microsoft") +set(CPACK_PACKAGE_NAME "onnxruntime-genai") +set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE") +set(CPACK_RESOURCE_FILE_README "${PROJECT_SOURCE_DIR}/README.md") +set(CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/microsoft/onnxruntime-genai") +set(CPACK_OUTPUT_FILE_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/package") +set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${VERSION_INFO}-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") +if (WIN32) + set(CPACK_GENERATOR "ZIP") +else () + set(CPACK_GENERATOR "TGZ") +endif () +set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY TRUE) +install(FILES + "${PROJECT_SOURCE_DIR}/README.md" + "${PROJECT_SOURCE_DIR}/ThirdPartyNotices.txt" + "${PROJECT_SOURCE_DIR}/SECURITY.md" + "${PROJECT_SOURCE_DIR}/LICENSE" + DESTINATION .) 
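Note on the new cmake/package.cmake: the "Package C/C++ API" steps added to capi-linux-step.yml and capi-win-step.yml rely on this CPack configuration. A rough sketch of the expected invocation and result, assuming only the variables defined in this file (Linux CPU preset shown; the Windows generator produces a ZIP instead of a TGZ):

  # Build the CPack-driven "package" target for the chosen preset.
  cmake --build --preset linux_gcc_cpu_release --target package
  # The archive lands under CPACK_OUTPUT_FILE_PREFIX, i.e. <binaryDir>/package
  # (build/cpu/package for the CPU builds), and is named
  # onnxruntime-genai-<VERSION_INFO>-<system>-<processor>.tar.gz.
  ls build/cpu/package/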
+include(CPack) \ No newline at end of file diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index cd0c0a0b9..1ea6d85c8 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -6,7 +6,7 @@ "configurePresets": [ { "name": "macos_default", - "generator": "Ninja", + "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index e4be639f2..f78aa196e 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -9,7 +9,8 @@ void CXX_API(const char* model_path) { auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; - std::cout << "Prompt: " << std::endl << prompt << std::endl; + std::cout << "Prompt: " << std::endl + << prompt << std::endl; auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt, *sequences); @@ -21,14 +22,15 @@ void CXX_API(const char* model_path) { auto output_sequences = model->Generate(*params); auto out_string = tokenizer->Decode(output_sequences->Get(0)); - std::cout << "Output: " << std::endl << out_string << std::endl; + std::cout << "Output: " << std::endl + << out_string << std::endl; } // C API Example void CheckResult(OgaResult* result) { if (result) { - std::string string=OgaResultGetError(result); + std::string string = OgaResultGetError(result); OgaDestroyResult(result); throw std::runtime_error(string); } @@ -84,6 +86,8 @@ int main(int argc, char** argv) { return -1; } + // Responsible for cleaning up the library during shutdown + OgaHandle handle; std::cout << "-------------" << std::endl; std::cout << "Hello, Phi-2!" << std::endl; diff --git a/examples/csharp/HelloPhi2/Program.cs b/examples/csharp/HelloPhi2/Program.cs index 993af8b57..fecb24ad7 100644 --- a/examples/csharp/HelloPhi2/Program.cs +++ b/examples/csharp/HelloPhi2/Program.cs @@ -1,6 +1,8 @@ // See https://aka.ms/new-console-template for more information using Microsoft.ML.OnnxRuntimeGenAI; +OgaHandle ogaHandle = new OgaHandle(); + Console.WriteLine("-------------"); Console.WriteLine("Hello, Phi-2!"); Console.WriteLine("-------------"); diff --git a/nuget.config b/nuget.config index 3e0389a52..f3853aee6 100644 --- a/nuget.config +++ b/nuget.config @@ -3,11 +3,14 @@ - - - - - - - + + + + + + + + + + \ No newline at end of file diff --git a/onnxruntime-genai.sln b/onnxruntime-genai.sln new file mode 100644 index 000000000..5e59cc82e --- /dev/null +++ b/onnxruntime-genai.sln @@ -0,0 +1,36 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{2253BDCC-33C9-431E-889A-56E3E75D10BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI", "src\csharp\Microsoft.ML.OnnxRuntimeGenAI.csproj", "{CA0EC087-3AF5-44D5-93F0-489420EBA014}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{505E2406-98C2-46DD-973A-3CEB95CF3626}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI.Tests", "test\csharp\Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj", "{24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any 
CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.Build.0 = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.Build.0 = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.ActiveCfg = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014} = {2253BDCC-33C9-431E-889A-56E3E75D10BA} + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73} = {505E2406-98C2-46DD-973A-3CEB95CF3626} + EndGlobalSection +EndGlobal diff --git a/src/config.cpp b/src/config.cpp index 39341f5b5..7dc3133ec 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -397,7 +397,7 @@ struct RootObject_Element : JSON::Element { JSON::Element& t_; }; -void ParseConfig(const std::filesystem::path& filename, Config& config) { +void ParseConfig(const fs::path& filename, Config& config) { std::ifstream file(filename, std::ios::binary | std::ios::ate); if (!file.is_open()) { throw std::runtime_error("Error opening " + filename.string()); @@ -421,7 +421,7 @@ void ParseConfig(const std::filesystem::path& filename, Config& config) { } } -Config::Config(const std::filesystem::path& path) : config_path{path} { +Config::Config(const fs::path& path) : config_path{path} { ParseConfig(path / "genai_config.json", *this); if (model.context_length == 0) diff --git a/src/config.h b/src/config.h index b94e05ca0..8fb5debdc 100644 --- a/src/config.h +++ b/src/config.h @@ -6,9 +6,9 @@ namespace Generators { struct Config { Config() = default; - Config(const std::filesystem::path& path); + Config(const fs::path& path); - std::filesystem::path config_path; // Path of the config directory + fs::path config_path; // Path of the config directory using ProviderOption = std::pair; struct ProviderOptions { diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index f2906f3df..8960bdb0e 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -71,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); @@ -129,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); @@ -176,5 +176,8 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGetCurrentGpuDeviceId(out IntPtr /* int32_t */ device_id); + + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern void OgaShutdown(); } } diff --git a/src/csharp/Utils.cs b/src/csharp/Utils.cs index 815652a71..2a8723280 100644 --- a/src/csharp/Utils.cs +++ b/src/csharp/Utils.cs @@ -7,6 +7,14 @@ namespace Microsoft.ML.OnnxRuntimeGenAI { + public class OgaHandle + { + ~OgaHandle() + { + NativeMethods.OgaShutdown(); + } + } + public class Utils { public static void SetCurrentGpuDeviceId(int device_id) diff --git a/src/dml/dml_adapter_info.h b/src/dml/dml_adapter_info.h index eb021b8ae..1638649bb 100644 --- a/src/dml/dml_adapter_info.h +++ b/src/dml/dml_adapter_info.h @@ -11,6 +11,7 @@ enum class VendorID { Undefined = 0, Intel = 0x8086, + Microsoft = 0x1414, }; // Retrieves information from a DXCore or DXGI adapter. @@ -27,4 +28,5 @@ class AdapterInfo { void Initialize(IDXCoreAdapter* adapter); ::VendorID vendor_id_; + uint32_t device_id_; }; diff --git a/src/dml/dml_helpers.cpp b/src/dml/dml_helpers.cpp index e7a0c2f08..d7954b84c 100644 --- a/src/dml/dml_helpers.cpp +++ b/src/dml/dml_helpers.cpp @@ -4,11 +4,82 @@ #include #include #include +#include #include "dml_helpers.h" #include "dml_adapter_info.h" namespace DmlHelpers { +static bool IsSoftwareAdapter(IDXGIAdapter1* adapter) { + DXGI_ADAPTER_DESC1 desc = {}; + THROW_IF_FAILED(adapter->GetDesc1(&desc)); + + // See here for documentation on filtering WARP adapter: + // https://docs.microsoft.com/en-us/windows/desktop/direct3ddxgi/d3d10-graphics-programming-guide-dxgi#new-info-about-enumerating-adapters-for-windows-8 + const bool is_basic_render_driver_vendor_id = desc.VendorId == static_cast(VendorID::Microsoft); + const bool is_basic_render_driver_device_id = desc.DeviceId == 0x8c; + return desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE || (is_basic_render_driver_vendor_id && is_basic_render_driver_device_id); +}; + +static std::vector> EnumerateAdapters() { + ComPtr dxgi_factory; + THROW_IF_FAILED(CreateDXGIFactory(IID_PPV_ARGS(&dxgi_factory))); + + std::vector> adapter_infos; + + ComPtr dxgi_factory6; + if (SUCCEEDED(dxgi_factory.As(&dxgi_factory6))) { + // Enumerate adapters by performance. This only works in Windows 10 Version 1803 and later. + ComPtr adapter; + for (uint32_t adapter_index = 0; + dxgi_factory6->EnumAdapterByGpuPreference( + adapter_index, + DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, + IID_PPV_ARGS(&adapter)) != DXGI_ERROR_NOT_FOUND; + adapter_index++) { + // Since we enumerate by performance, we can ignore everything that comes after the first software adapter, which includes the IDD + // adapters. This is necessary for now because IDD (e.g. remote desktop) adapters don't have the DXGI_ADAPTER_FLAG_SOFTWARE flag, + // even though they run on software. 
+ if (IsSoftwareAdapter(adapter.Get())) { + break; + } + + // Make sure that we are able to create the device + ComPtr d3d12_device; + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&d3d12_device))); + + if (d3d12_device) { + adapter_infos.emplace_back(std::move(adapter)); + } + } + } else { + // Enumerate adapters without ordering. + ComPtr adapter; + for (uint32_t adapter_index = 0; dxgi_factory->EnumAdapters1(adapter_index, &adapter) != DXGI_ERROR_NOT_FOUND; adapter_index++) { + // We can't assume the ordering of hardware and software adapters, so keep looping. This path should only execute on Windows 10 + // version 1709 or earlier; IDD (e.g. remote desktop) adapters do not exist when taking this code path. + if (IsSoftwareAdapter(adapter.Get())) { + continue; + } + + // Make sure that we are able to create the device + ComPtr d3d12_device; + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&d3d12_device))); + + if (d3d12_device) { + adapter_infos.emplace_back(std::move(adapter)); + } + } + } + + return adapter_infos; +} + +static ComPtr CreatePerformantAdapter() { + auto filtered_adapters = EnumerateAdapters(); + return filtered_adapters.front(); +} + DmlObjects CreateDmlObjects() { D3D12_COMMAND_QUEUE_DESC command_queue_description = { D3D12_COMMAND_LIST_TYPE_COMPUTE, @@ -19,7 +90,8 @@ DmlObjects CreateDmlObjects() { DmlObjects dml_objects; - THROW_IF_FAILED(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); + auto adapter = CreatePerformantAdapter(); + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandQueue(&command_queue_description, IID_PPV_ARGS(&dml_objects.command_queue))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&dml_objects.command_allocator))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, dml_objects.command_allocator.Get(), nullptr, IID_PPV_ARGS(&dml_objects.command_list))); diff --git a/src/filesystem.h b/src/filesystem.h new file mode 100644 index 000000000..45c4c7015 --- /dev/null +++ b/src/filesystem.h @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// TODO(baijumeswani): Remove experimental when packaging pipeline can use GCC > 8 +#ifdef USE_EXPERIMENTAL_FILESYSTEM +#include +namespace fs = std::experimental::filesystem; +#else +#include +namespace fs = std::filesystem; +#endif diff --git a/src/generators.cpp b/src/generators.cpp index 49751c3e9..133d735da 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -11,6 +11,23 @@ namespace Generators { +static bool _ = (Ort::InitApi(), false); + +OrtGlobals::OrtGlobals() : env_{OrtEnv::Create(OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR)} {} + +std::unique_ptr& GetOrtGlobals() { + static auto globals = std::make_unique(); + return globals; +} + +void Shutdown() { + GetOrtGlobals().reset(); +} + +OrtEnv& GetOrtEnv() { + return *GetOrtGlobals()->env_; +} + // IEEE 752-2008 binary16 format, 1 sign bit, 5 bit exponent, 10 bit fraction float Float16ToFloat32(uint16_t v) { // Extract sign, exponent, and fraction from numpy.float16 @@ -44,7 +61,25 @@ GeneratorParams::GeneratorParams(const Model& model) eos_token_id{model.config_->model.eos_token_id}, vocab_size{model.config_->model.vocab_size}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_} { + cuda_stream{model.cuda_stream_}, + is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { +} + +void GeneratorParams::TryGraphCapture(int max_bs) { + if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + // no-op + return; + } + + if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (max_bs == 0) { + throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); + } + use_cuda_graph = true; + max_batch_size = max_bs; + } else { + throw std::runtime_error("CUDA graph is not supported on this device"); + } } std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { diff --git a/src/generators.h b/src/generators.h index 3b2e65fa8..beb958353 100644 --- a/src/generators.h +++ b/src/generators.h @@ -5,8 +5,9 @@ #include #include #include -#include +#include "filesystem.h" #include +#include #include "span.h" #include #include @@ -61,6 +62,7 @@ struct GeneratorParams : std::enable_shared_from_this { int batch_size{1}; int max_batch_size{0}; + bool use_cuda_graph{}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } @@ -97,6 +99,11 @@ struct GeneratorParams : std::enable_shared_from_this { std::vector input_ids_owner; // Backing memory of input_ids in some cases std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + + void TryGraphCapture(int max_bs); + + private: + bool is_cuda_graph_enabled_{}; }; struct Generator { @@ -114,6 +121,23 @@ struct Generator { bool computed_logits_{}; // Set to true in ComputeLogits() and false after appending a token to ensure a 1 to 1 call ratio }; +struct OrtGlobals { + OrtGlobals(); + + std::unique_ptr env_; +#if USE_CUDA + std::unique_ptr memory_info_cuda_; + std::unique_ptr allocator_cuda_; +#endif + private: + OrtGlobals(const OrtGlobals&) = delete; + void operator=(const OrtGlobals&) = delete; +}; + +std::unique_ptr& GetOrtGlobals(); +void Shutdown(); // Do this once at exit, Ort code will fail after this call +OrtEnv& GetOrtEnv(); + std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path); std::shared_ptr CreateGeneratorParams(const Model& model); std::shared_ptr CreateGeneratorParams(); // For benchmarking purposes only diff --git a/src/logging.cpp b/src/logging.cpp index 
6c334f50a..edd698168 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -44,7 +44,7 @@ void SetLogString(std::string_view name, std::string_view value) { if (value.empty()) gp_logfile.reset(); else { - std::filesystem::path filename{value}; + fs::path filename{std::string(value)}; gp_logfile = std::make_unique(filename); } diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 140f2a8cd..96cc029b8 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -24,7 +24,7 @@ static std::tuple MakeKey(int max_batch_size, int max_length, int } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { return nullptr; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 83d1f03d3..53f4f6697 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -26,7 +26,7 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, RoamingArra RoamingArray DecoderOnly_State::Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices) { if (first_run_) { - if (model_.use_cuda_graph_) { + if (params_->use_cuda_graph) { model_.run_options_->AddConfigEntry("gpu_graph_id", "-1"); } first_run_ = false; @@ -37,7 +37,7 @@ RoamingArray DecoderOnly_State::Run(int current_length, RoamingArrayuse_cuda_graph) { int new_batch_size = static_cast(input_ids_.GetShape()[0]); if (new_batch_size != current_batch_size_) { current_batch_size_ = new_batch_size; diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 0c4436928..3c2e0dbfa 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -117,7 +117,7 @@ KV_Cache::KV_Cache(const Model& model, State& state) : model_{model}, state_{state}, layer_count_{model_.config_->model.decoder.num_hidden_layers}, - past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && (model_.device_type_ == DeviceType::CUDA || model_.device_type_ == DeviceType::DML)}, + past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1}, shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { if (g_log.enabled && g_log.warning && past_present_share_buffer_ != state_.params_->search.past_present_share_buffer) Log("warning", "past_present_share_buffer search option set to true, but has been disabled due to the current configuration. See https://aka.ms/generate_config for details"); diff --git a/src/models/model.cpp b/src/models/model.cpp index 9226e5c05..a5de9979f 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -187,14 +187,12 @@ std::vector Tokenizer::DecodeBatch(std::span sequenc // has been destroyed. Without this, we will crash in the Onnxruntime BFCArena code when deleting tensors due to the // arena already being destroyed. 
Ort::Allocator* GetCudaAllocator(OrtSession& session) { - static std::unique_ptr memory_info_cuda_; - static std::unique_ptr allocator_cuda_; - - if (!allocator_cuda_) { - memory_info_cuda_ = OrtMemoryInfo::Create("Cuda", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); - allocator_cuda_ = Ort::Allocator::Create(session, *memory_info_cuda_); + auto& globals = *GetOrtGlobals(); + if (!globals.allocator_cuda_) { + globals.memory_info_cuda_ = OrtMemoryInfo::Create("Cuda", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + globals.allocator_cuda_ = Ort::Allocator::Create(session, *globals.memory_info_cuda_); } - return allocator_cuda_.get(); + return globals.allocator_cuda_.get(); } #endif @@ -316,7 +314,7 @@ void Model::CreateSessionOptions() { } if (options.enable_profiling.has_value()) { - std::filesystem::path profile_file_prefix{options.enable_profiling.value()}; + fs::path profile_file_prefix{options.enable_profiling.value()}; ort_options.EnableProfiling(profile_file_prefix.c_str()); } @@ -539,26 +537,4 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, return expanded; } -void Model::GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params) { - bool is_cuda_graph_enabled = device_type_ == DeviceType::DML || IsCudaGraphEnabled(config_->model.decoder.session_options); - max_batch_size_ = params.max_batch_size; - - if (DeviceType::CUDA == device_type_) { - if (is_cuda_graph_enabled) { - if (max_batch_size_ == 0) { - throw std::runtime_error("CUDA graph is enabled, but max_batch_size is not set."); - } - use_cuda_graph_ = true; - } - } else if (DeviceType::DML == device_type_) { - if (max_batch_size_ == 0) { - throw std::runtime_error("max_batch_size needs to be set when using DirectML."); - } - - use_cuda_graph_ = true; - } else if (is_cuda_graph_enabled) { - throw std::runtime_error("CUDA graph is not supported on this device"); - } -} - } // namespace Generators diff --git a/src/models/model.h b/src/models/model.h index b569373f8..d1b0a1ec0 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -119,8 +119,6 @@ struct Model : std::enable_shared_from_this { std::unique_ptr ExpandInputs(std::unique_ptr& input, int num_beams) const; - void GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params); - CapturedGraphPool* GetCapturedGraphPool() const { return captured_graph_pool_.get(); } std::unique_ptr config_; @@ -136,9 +134,6 @@ struct Model : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime - bool use_cuda_graph_{}; - int max_batch_size_{}; - #if USE_DML DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } diff --git a/src/ort_genai.h b/src/ort_genai.h index fb863dae2..aa6553a5c 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -201,7 +201,7 @@ struct OgaGeneratorParams : OgaAbstract { }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; 
OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); @@ -235,3 +235,10 @@ struct OgaGenerator : OgaAbstract { static void operator delete(void* p) { OgaDestroyGenerator(reinterpret_cast(p)); } }; + +struct OgaHandle { + OgaHandle() = default; + ~OgaHandle() noexcept { + OgaShutdown(); + } +}; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index ce5eb56e9..c383d22f0 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -13,16 +13,6 @@ namespace Generators { -std::unique_ptr g_ort_env; - -OrtEnv& GetOrtEnv() { - if (!g_ort_env) { - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - } - return *g_ort_env; -} - struct Result { explicit Result(const char* what) : what_{what} {} std::string what_; @@ -39,6 +29,10 @@ extern "C" { return reinterpret_cast(std::make_unique(e.what()).release()); \ } +void OGA_API_CALL OgaShutdown() { + Generators::Shutdown(); +} + const char* OGA_API_CALL OgaResultGetError(const OgaResult* result) { return reinterpret_cast(result)->what_.c_str(); } @@ -111,7 +105,7 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { OGA_TRY auto* params = reinterpret_cast(generator_params); - params->max_batch_size = max_batch_size; + params->TryGraphCapture(max_batch_size); return nullptr; OGA_CATCH } @@ -146,23 +140,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { +OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - auto result = Generators::Generate(*model_p, *params); + auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - *out = reinterpret_cast(CreateGenerator(*model_p, *params).release()); + *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index df4c47dde..7c703061b 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -40,6 +40,10 @@ typedef struct OgaSequences OgaSequences; typedef struct OgaTokenizer OgaTokenizer; typedef struct OgaTokenizerStream OgaTokenizerStream; +/* \brief Call this on process exit to cleanly shutdown the genai library & its onnxruntime usage + */ +OGA_EXPORT void OGA_API_CALL OgaShutdown(); + /* * \param[in] result OgaResult that contains the error message. * \return Error message contained in the OgaResult. The const char* is owned by the OgaResult @@ -111,7 +115,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. 
* \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -161,7 +165,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index bc85d8bce..6dbf82371 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -64,8 +64,12 @@ if(BUILD_WHEEL) "libcufft.so.11" "libcurand.so.10" "libnvinfer.so.8" + "libnvinfer.so.10" "libnvinfer_plugin.so.8" + "libnvinfer_plugin.so.10" "libnvonnxparser.so.8" + "libnvonnxparser.so.10" + ) set(modified_exclude_list) foreach(item IN LISTS auditwheel_exclude_list) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1a9c37a3e..3865de4bc 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -167,16 +167,17 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "use_rotemb_in_attn": False, # Use rotary embeddings within attention op (instead of a separate RotaryEmbedding op) "use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V) } - if self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16: + enable_GQA_on_CPU = True if "enable_GQA_on_CPU" in extra_options and extra_options["enable_GQA_on_CPU"] == "1" else False + if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT): # Change model settings for GroupQueryAttention self.attention_attrs["op_type"] = "GroupQueryAttention" - print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.") + print("GroupQueryAttention (GQA) is used in this model.") # DML doesn't support packed Q/K/V for GQA yet self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads # GQA + Rot.Emb. 
does not require `position ids` as input - if self.ep == "cuda": + if self.ep in {"cuda", "cpu"}: self.attention_attrs["use_rotemb_in_attn"] = True self.input_names.remove("position_ids") @@ -227,7 +228,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): "num_key_value_heads": self.num_kv_heads, }, "eos_token_id": config.eos_token_id, - "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id, + "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id, "type": self.model_type[ : self.model_type.find("For")].lower(), "vocab_size": self.vocab_size, }, @@ -1979,6 +1980,7 @@ def get_args(): enable_cuda_graph = 1 : The model can use CUDA graph capture for CUDA execution provider. If enabled, all nodes being placed on the CUDA EP is the prerequisite for the CUDA graph to be used correctly. It is not guaranteed that cuda graph be enabled as it depends on the model and the graph structure. + enable_GQA_on_CPU = Enalbe G(Group)Query(Q)Attention(A) on CPU. """), ) diff --git a/src/python/python.cpp b/src/python/python.cpp index dd9d5a9f3..1d8a4e567 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -24,15 +24,6 @@ pybind11::array_t ToPython(std::span v) { namespace Generators { -std::unique_ptr g_ort_env; - -OrtEnv& GetOrtEnv() { - if (!g_ort_env) { - g_ort_env = OrtEnv::Create(); - } - return *g_ort_env; -} - // A roaming array is one that can be in CPU or GPU memory, and will copy the memory as needed to be used from anywhere template struct PyRoamingArray : RoamingArray { @@ -113,7 +104,7 @@ struct PyGeneratorParams { } void TryUseCudaGraphWithMaxBatchSize(pybind11::int_ max_batch_size) { - params_->max_batch_size = max_batch_size.cast(); + params_->TryGraphCapture(max_batch_size.cast()); } pybind11::array_t py_input_ids_; @@ -124,7 +115,6 @@ struct PyGeneratorParams { struct PyGenerator { PyGenerator(Model& model, PyGeneratorParams& params) { params.Prepare(); - model.GetMaxBatchSizeFromGeneratorParams(params); generator_ = CreateGenerator(model, params); } @@ -186,6 +176,14 @@ PYBIND11_MODULE(onnxruntime_genai, m) { )pbdoc"; + // Add a cleanup call to happen before global variables are destroyed + static int unused{}; // The capsule needs something to reference + pybind11::capsule cleanup( + &unused, "cleanup", [](PyObject*) { + Generators::Shutdown(); + }); + m.add_object("_cleanup", cleanup); + // So that python users can catch OrtExceptions specifically pybind11::register_exception(m, "OrtException"); @@ -203,9 +201,6 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize); - // We need to init the OrtApi before we can use it - Ort::InitApi(); - pybind11::class_(m, "TokenizerStream") .def("decode", [](TokenizerStream& t, int32_t token) { return t.Decode(token); }); @@ -233,7 +228,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) - .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); model.GetMaxBatchSizeFromGeneratorParams(params); return Generate(model, params); }) + .def("generate", [](Model& model, 
PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") diff --git a/src/python/setup.py.in b/src/python/setup.py.in index 866515be1..1d784df2e 100644 --- a/src/python/setup.py.in +++ b/src/python/setup.py.in @@ -13,6 +13,7 @@ class BinaryDistribution(Distribution): setup( name='@TARGET_NAME@', version='@VERSION_INFO@', + description='Generative AI API for ONNX Runtime', packages=['onnxruntime_genai', 'onnxruntime_genai.models'], include_package_data=True, package_data={'': ['*.pyd', '*.dll', '*.so*']}, diff --git a/src/smartptrs.h b/src/smartptrs.h index 9eab82abc..5591cfde5 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -115,7 +115,7 @@ struct cuda_stream_holder { #else struct cuda_stream_holder { void Create() { - assert(false); + throw std::runtime_error("Trying to create a cuda stream in a non cuda build"); } operator cudaStream_t() const { return v_; } diff --git a/src/tokenizer/CMakeLists.txt b/src/tokenizer/CMakeLists.txt index 69d603715..135dedce6 100644 --- a/src/tokenizer/CMakeLists.txt +++ b/src/tokenizer/CMakeLists.txt @@ -8,11 +8,10 @@ file(GLOB tokenizer_srcs CONFIGURE_DEPENDS ) FetchContent_Declare(GSL URL https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip) -FetchContent_MakeAvailable(GSL) +onnxruntime_fetchcontent_makeavailable(GSL) FetchContent_Declare(simdjson URL https://github.com/simdjson/simdjson/archive/refs/tags/v3.6.3.zip URL_HASH SHA1=2b063a2e81f74a5d1cb937fadf3d2fca0f1edb09) -FetchContent_MakeAvailable(simdjson) - +onnxruntime_fetchcontent_makeavailable(simdjson) add_library(tokenizer STATIC ${tokenizer_srcs}) message(STATUS "GSL_SOURCE_DIR: ${GSL_SOURCE_DIR}") diff --git a/src/tokenizer/c_api/tfmtok_c.cc b/src/tokenizer/c_api/tfmtok_c.cc index 02c57ce65..3dc9be009 100644 --- a/src/tokenizer/c_api/tfmtok_c.cc +++ b/src/tokenizer/c_api/tfmtok_c.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
#include -#include +#include "../filesystem.h" #include #include "tfmtok.h" @@ -117,7 +117,7 @@ tfmError_t TFM_API_CALL TfmCreateTokenizer(TfmTokenizer** tokenizer, return kTfmErrorInvalidArgument; } - if (!std::filesystem::is_directory(tokenizer_path)) { + if (!fs::is_directory(tokenizer_path)) { last_error_message = std::string("Cannot find the directory of ") + tokenizer_path; return kTfmErrorInvalidArgument; } diff --git a/src/tokenizer/config.cc b/src/tokenizer/config.cc index dbc0908cf..a40b7d7db 100644 --- a/src/tokenizer/config.cc +++ b/src/tokenizer/config.cc @@ -4,7 +4,7 @@ #include #include #include -#include +#include "../filesystem.h" #include "config.h" @@ -68,8 +68,7 @@ TfmStatus TokenConfig::LoadJson(const std::string& json_path) { simdjson::dom::parser parser; simdjson::dom::element root; - if (!std::filesystem::exists( - std::filesystem::path(json_path).lexically_normal())) { + if (!fs::exists(fs::path(json_path))) { return {kTfmErrorInvalidFile, std::string(json_path) + " not found"}; } std::string json_text = PatchJsonText(json_path); diff --git a/src/tokenizer/token_bpe.cc b/src/tokenizer/token_bpe.cc index 93c897eea..80ac9d5bf 100644 --- a/src/tokenizer/token_bpe.cc +++ b/src/tokenizer/token_bpe.cc @@ -237,15 +237,17 @@ std::vector BPETokenizer::Encode(std::string_view sv_input, int64_ text = text.strip() */ std::u32string str = RemoveConsecutiveSpaces(input); - if (IsUnicodeSpace(str.front())) { - str.erase(str.begin()); - } - if (IsUnicodeSpace(str.back())) { - str.pop_back(); + if (!str.empty()) { + if (IsUnicodeSpace(str.front())) { + str.erase(str.begin()); + } + if (IsUnicodeSpace(str.back())) { + str.pop_back(); + } + // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned) + str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end()); + str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end()); } - // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned) - str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end()); - str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end()); input = str; } @@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta token.push_back(' '); } } // end case of whitespace_token_ + + bpe_state->incomplete_utf8_ += token; + token.clear(); + std::string& s_utf8 = bpe_state->incomplete_utf8_; + size_t utf8_len = 1; + size_t utf8_all_len = 0; + for (size_t i = 0; i < s_utf8.size(); i += utf8_len) { + utf8_len = UTF8Len(s_utf8[i]); + if (utf8_len <= s_utf8.size() - i) { + utf8_all_len += utf8_len; + auto _t = s_utf8.substr(i, utf8_len); + token += ValidateUTF8(_t) ? 
_t : ""; + } + } + s_utf8 = s_utf8.substr(utf8_all_len); } return status; diff --git a/src/tokenizer/token_bpe.h b/src/tokenizer/token_bpe.h index ed5f1f23c..2327b3a60 100644 --- a/src/tokenizer/token_bpe.h +++ b/src/tokenizer/token_bpe.h @@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl { BPEDeocerState() = default; ~BPEDeocerState() override = default; bool f_special_last; + std::string incomplete_utf8_; }; public: diff --git a/src/tokenizer/tokenizer.cc b/src/tokenizer/tokenizer.cc index b2a0622e7..4f52acd72 100644 --- a/src/tokenizer/tokenizer.cc +++ b/src/tokenizer/tokenizer.cc @@ -1,7 +1,7 @@ #include "token_bpe.h" #include "token_rwkv.h" -#include +#include "../filesystem.h" #include namespace tfm { @@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path, if (type.empty()) { if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) { type = "BPE"; - } else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) { + } /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) { // if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model type = "SPM"; - } else { + } */ else { status = TfmStatus(kTfmErrorInvalidArgument, "Cannot determine the tokenizer type from tokenizer_path argument"); } } @@ -43,7 +43,7 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path, } /* else if (type == "SPM") { token_ptr = std::make_unique(); } */ else { - status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported."); + status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported."); } if (status.ok()) { diff --git a/src/tokenizer/utils/unescape.cc b/src/tokenizer/utils/unescape.cc index f42e962f9..f94a1f192 100644 --- a/src/tokenizer/utils/unescape.cc +++ b/src/tokenizer/utils/unescape.cc @@ -41,27 +41,60 @@ std::string EncodeUTF8Char(char32_t utf8_char) { return {utf8_buf}; } -bool ValidateUTF8(const std::string& data) { - int cnt = 0; - for (size_t i = 0; i < data.size(); i++) { - int x = data[i]; - if (!cnt) { - if ((x >> 5) == 0b110) { - cnt = 1; - } else if ((x >> 4) == 0b1110) { - cnt = 2; - } else if ((x >> 3) == 0b11110) { - cnt = 3; - } else if ((x >> 7) != 0) { + bool ValidateUTF8(const std::string& data) { + const unsigned char* s = reinterpret_cast(data.c_str()); + const unsigned char* s_end = s + data.size(); + if (*s_end != '\0') + return false; + + while (*s) { + if (*s < 0x80) + /* 0xxxxxxx */ + s++; + else if ((s[0] & 0xe0) == 0xc0) { + /* 110XXXXx 10xxxxxx */ + if (s + 1 >= s_end) { + return false; + } + if ((s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) /* overlong? */ + return false; + else + s += 2; + } else if ((s[0] & 0xf0) == 0xe0) { + /* 1110XXXX 10Xxxxxx 10xxxxxx */ + if (s + 2 >= s_end) { + return false; + } + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */ + return false; + else + s += 3; + } else if ((s[0] & 0xf8) == 0xf0) { + /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ + if (s + 3 >= s_end) { + return false; + } + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? 
*/ + return false; + else + s += 4; + } else return false; - } - } else { - if ((x >> 6) != 0b10) return false; - cnt--; } + + return true; } - return cnt == 0; -} + bool IsDigit(char c) { return c >= '0' && c <= '9'; } bool IsHexDigit(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } diff --git a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj similarity index 92% rename from test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj rename to test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index e4ec8e6d8..978deb04e 100644 --- a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -12,7 +12,8 @@ default True Debug;RelWithDebInfo;Release - + https://api.nuget.org/v3/index.json + $(RestoreAdditionalProjectSources);$(RestoreSources) Microsoft.ML.OnnxRuntimeGenAI.Tests Microsoft.ML.OnnxRuntimeGenAI.Tests diff --git a/test/main.cpp b/test/main.cpp index 2f69632b0..ee21918c4 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -5,20 +5,16 @@ #include #include -extern std::unique_ptr g_ort_env; - int main(int argc, char** argv) { std::cout << "Generators Utility Library" << std::endl; std::cout << "Initializing OnnxRuntime... "; std::cout.flush(); try { - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); std::cout << "done" << std::endl; ::testing::InitGoogleTest(&argc, argv); int result = RUN_ALL_TESTS(); std::cout << "Shutting down OnnxRuntime... "; - g_ort_env.reset(); + Generators::Shutdown(); std::cout << "done" << std::endl; return result; } catch (const std::exception& e) { diff --git a/test/model_tests.cpp b/test/model_tests.cpp index edeeb4ea4..66ceaee83 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -10,7 +10,6 @@ #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" #endif -std::unique_ptr g_ort_env; // To generate this file: // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 @@ -33,7 +32,7 @@ TEST(ModelTests, GreedySearchGptFp32) { // To generate this file: // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 // And copy the resulting gpt2_init_past_fp32.onnx file into these two files (as it's the same for gpt2) - auto model = Generators::CreateModel(*g_ort_env, + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto params = Generators::CreateGeneratorParams(*model); @@ -74,7 +73,7 @@ TEST(ModelTests, BeamSearchGptFp32) { // --output tiny_gpt2_beamsearch_fp16.onnx --use_gpu --max_length 20 // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto params = Generators::CreateGeneratorParams(*model); params->batch_size = static_cast(input_ids_shape[0]); @@ -119,7 +118,7 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, const char* model_label) 0, 0, 0, 52, 204, 204, 204, 204, 204, 204, 0, 0, 195, 731, 731, 114, 114, 114, 114, 114}; - auto model = Generators::CreateModel(*g_ort_env, model_path); + auto model = 
Generators::CreateModel(Generators::GetOrtEnv(), model_path); auto params = Generators::CreateGeneratorParams(*model); params->batch_size = static_cast(input_ids_shape[0]); @@ -164,7 +163,7 @@ void Test_BeamSearch_Gpt_Cuda(const char* model_path, const char* model_label) { // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 // --output tiny_gpt2_beamsearch_fp16.onnx --use_gpu --max_length 20 // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) - auto model = Generators::CreateModel(*g_ort_env, model_path); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), model_path); auto params = Generators::CreateGeneratorParams(*model); params->batch_size = static_cast(input_ids_shape[0]); @@ -215,7 +214,7 @@ Print all primes between 1 and n std::cout << "With prompt:" << prompt << "\r\n"; - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "phi-2"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "phi-2"); auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); @@ -253,7 +252,7 @@ Print all primes between 1 and n std::cout << "With prompt:" << prompt << "\r\n"; - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "phi-2"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "phi-2"); auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index 6190e2507..e614b2b20 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -14,13 +14,11 @@ #define MODEL_PATH "../../test/test_models/" #endif -extern std::unique_ptr g_ort_env; - // Defined in sampling_tests.cpp void CreateRandomLogits(float* logits, int num_large, int vocab_size, int batch_size, std::mt19937& engine); TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; @@ -54,7 +52,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCpu) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; int k = 5; @@ -91,7 +89,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; float p = 0.95f; @@ -132,7 +130,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { #include "tests_helper.cuh" TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); 
int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; @@ -175,10 +173,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { - std::unique_ptr g_ort_env; - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; int k = 5; @@ -218,10 +213,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { - std::unique_ptr g_ort_env; - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; float p = 0.95f; @@ -266,10 +258,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { } TEST(Benchmarks, BenchmarkRandomizedSelectTopCuda) { - std::unique_ptr g_ort_env; - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 12; std::vector input_ids{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; // Needs to match batch_size diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 239c71ab6..d2fe34cc6 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -12,10 +12,9 @@ #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" #endif -extern std::unique_ptr g_ort_env; TEST(SamplingTests, BatchedSamplingTopPCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector expected_output{1, 2, 3, 4}; auto output_span = Generators::cpu_span(expected_output); @@ -45,7 +44,7 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { } TEST(SamplingTests, BatchedSamplingTopKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -78,7 +77,7 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { } TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -128,7 +127,7 @@ void CreateRandomLogits(float* logits, int num_large, int vocab_size, int batch_ } TEST(SamplingTests, RandomizedSamplingTopPCpu) { - auto model = 
Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; @@ -165,7 +164,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { } TEST(SamplingTests, RandomizedSamplingTopKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; int k = 5; @@ -203,7 +202,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { } TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; float p = 0.95f; @@ -246,7 +245,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { #include "tests_helper.cuh" TEST(SamplingTests, BatchedSamplingTopPCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector expected_output{1, 2, 3, 4}; auto output_span = Generators::cpu_span(expected_output); @@ -278,7 +277,7 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { } TEST(SamplingTests, BatchedSamplingTopKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -312,7 +311,7 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { } TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -347,7 +346,7 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { } TEST(SamplingTests, RandomizedSamplingTopPCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; @@ -388,7 +387,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { } TEST(SamplingTests, RandomizedSamplingTopKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; int k = 5; @@ -430,7 +429,7 @@ 
TEST(SamplingTests, RandomizedSamplingTopKCuda) { } TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; float p = 0.95f; @@ -474,7 +473,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { } TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4};
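
The new OgaShutdown export, together with the OgaHandle helpers added in src/ort_genai.h and src/csharp/Utils.cs, lets an application tear down the library's shared OrtEnv exactly once at process exit. A minimal sketch of the intended C++ usage follows; the model path is a placeholder and the calls assume the existing ort_genai.h wrappers, so treat it as illustrative rather than shipped sample code.

    #include "ort_genai.h"

    int main() {
      // Constructed first, destroyed last: its destructor runs OgaShutdown()
      // after every other Oga* object below has been released, mirroring the
      // C# OgaHandle finalizer.
      OgaHandle handle;

      // "path/to/model" is a placeholder model directory for illustration.
      auto model = OgaModel::Create("path/to/model");
      auto params = OgaGeneratorParams::Create(*model);
      // ... set input sequences and search options on params ...

      // Generate() is const in this change, so a shared, read-only model works here.
      auto output_sequences = model->Generate(*params);
      return 0;
    }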
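
src/filesystem.h exists so translation units can use the `fs::` alias and keep building against GCC 8's std::experimental::filesystem as well as newer toolchains; logging.cpp, model.cpp, config.cc and tokenizer.cc switch over to it in this diff. A small consumer would look like the sketch below; the genai_config.json name is only an example, and the USE_EXPERIMENTAL_FILESYSTEM define (plus any -lstdc++fs link flag) is assumed to come from the build system, which this diff does not show.

    #include <string>
    #include "filesystem.h"  // maps fs:: to std::filesystem or std::experimental::filesystem

    bool ConfigExists(const std::string& config_dir) {
      // Callers stay agnostic to which filesystem implementation the toolchain provides.
      return fs::exists(fs::path(config_dir) / "genai_config.json");
    }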
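
Model::GetMaxBatchSizeFromGeneratorParams is gone; graph-capture validation now lives in GeneratorParams::TryGraphCapture and is reached from the C API through OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize (and from Python via try_use_cuda_graph_with_max_batch_size). A hedged sketch of the C-API flow, with error handling reduced to a small local helper:

    #include <stdexcept>
    #include <string>
    #include "ort_genai_c.h"

    static void Check(OgaResult* result) {
      if (result) {
        std::string message = OgaResultGetError(result);
        OgaDestroyResult(result);
        throw std::runtime_error(message);
      }
    }

    void ConfigureForGraphCapture(const OgaModel* model, int32_t max_batch_size) {
      OgaGeneratorParams* params = nullptr;
      Check(OgaCreateGeneratorParams(model, &params));

      // On CUDA/DML this errors out when capture is enabled but max_batch_size is 0;
      // on CPU, or when capture is not enabled in the session options, it is a no-op.
      Check(OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(params, max_batch_size));

      OgaGenerator* generator = nullptr;
      Check(OgaCreateGenerator(model, params, &generator));
      // ... run the generator ...
      OgaDestroyGenerator(generator);
      OgaDestroyGeneratorParams(params);
    }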
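
BPETokenizer::Id2Token now parks partially decoded bytes in BPEDeocerState::incomplete_utf8_ so that a multi-byte character split across two token ids is only surfaced once all of its bytes have arrived (and only if it passes ValidateUTF8). The standalone sketch below mimics that buffering with a local Utf8Len helper; the names and the simplified loop are illustrative, not the library code.

    #include <cassert>
    #include <string>

    static size_t Utf8Len(unsigned char first_byte) {
      if ((first_byte & 0x80) == 0x00) return 1;
      if ((first_byte & 0xe0) == 0xc0) return 2;
      if ((first_byte & 0xf0) == 0xe0) return 3;
      if ((first_byte & 0xf8) == 0xf0) return 4;
      return 1;  // invalid lead byte: pass it through rather than stalling the stream
    }

    // `pending` plays the role of bpe_state->incomplete_utf8_: bytes carried over
    // from previous tokens. Only complete characters are returned to the caller.
    std::string EmitCompleteUtf8(std::string& pending, const std::string& token_bytes) {
      pending += token_bytes;
      std::string out;
      size_t consumed = 0;
      for (size_t i = 0; i < pending.size();) {
        size_t len = Utf8Len(static_cast<unsigned char>(pending[i]));
        if (len > pending.size() - i) break;  // trailing bytes of an unfinished character
        out += pending.substr(i, len);
        i += len;
        consumed = i;
      }
      pending = pending.substr(consumed);
      return out;
    }

    int main() {
      std::string pending;
      // U+20AC (the euro sign) is e2 82 ac; feed it split across two "tokens".
      assert(EmitCompleteUtf8(pending, "\xe2\x82") == "");  // nothing complete yet
      assert(EmitCompleteUtf8(pending, "\xac!") == "\xe2\x82\xac!");
      return 0;
    }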
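
The rewritten ValidateUTF8 walks raw bytes and rejects overlong encodings, UTF-16 surrogates, U+FFFE/U+FFFF, and code points above U+10FFFF, rather than only counting continuation bytes. The spot checks below capture that intended accept/reject behaviour; they declare the function locally and are meant to be linked against unescape.cc, so adjust the include and namespace to match the real header before adding them to the test suite.

    #include <cassert>
    #include <string>

    // Mirrors the definition in src/tokenizer/utils/unescape.cc; adjust the
    // namespace/include to the real declaration when wiring this into the tests.
    bool ValidateUTF8(const std::string& data);

    int main() {
      assert(ValidateUTF8("plain ascii"));
      assert(ValidateUTF8("\xe2\x82\xac"));       // U+20AC, well-formed 3-byte sequence
      assert(!ValidateUTF8("\xc0\xaf"));          // overlong encoding of '/'
      assert(!ValidateUTF8("\xed\xa0\x80"));      // UTF-16 surrogate U+D800
      assert(!ValidateUTF8("\xf4\x90\x80\x80"));  // first code point above U+10FFFF
      assert(!ValidateUTF8("\xe2\x82"));          // truncated sequence
      return 0;
    }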