diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 2e1c03aab..8063cbeda 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -4,10 +4,10 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-1.17.3" - ort_zip: "onnxruntime-linux-x64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz" - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" jobs: linux_cpu_x64: runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ] @@ -16,19 +16,49 @@ jobs: uses: actions/checkout@v4 with: submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/22.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - - name: Download OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ${{ env.NUGET_EXE }} install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + + - name: list files + shell: bash + run: | + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extract OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version - name: Build with CMake and GCC run: | diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index c1e51251b..6ea48d5d7 100644 --- 
a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -6,9 +6,11 @@ concurrency: cancel-in-progress: true env: - ort_dir: "onnxruntime-linux-x64-gpu-1.17.3" - ort_zip: "onnxruntime-linux-x64-gpu-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-gpu-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Linux&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: Microsoft.ML.OnnxRuntime.Gpu.Linux + ORT_NIGHTLY_SOURCE: "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json" + NUGET_EXE: "mono /usr/local/bin/nuget.exe" + jobs: linux-cuda-x64-build: @@ -29,19 +31,49 @@ jobs: clean: true path: manylinux submodules: true + - name: install Mono and Nuget + run: | + sudo apt install ca-certificates gnupg + sudo gpg --homedir /tmp --no-default-keyring --keyring /usr/share/keyrings/mono-official-archive-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF + echo "deb [signed-by=/usr/share/keyrings/mono-official-archive-keyring.gpg] https://download.mono-project.com/repo/ubuntu stable-focal main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list + sudo apt update + sudo apt install -y mono-devel + sudo curl -o /usr/local/bin/nuget.exe https://dist.nuget.org/win-x86-commandline/latest/nuget.exe + sudo chmod +x /usr/local/bin/nuget.exe + + - name: Install jq and dotnet + run: | + wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb + sudo dpkg -i packages-microsoft-prod.deb + rm packages-microsoft-prod.deb + sudo apt-get update && sudo apt-get install -y dotnet-sdk-8.0 jq - name: Download OnnxRuntime run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV - - name: Unzip OnnxRuntime + - name: Download OnnxRuntime Nightly + run: | + mono /usr/local/bin/nuget.exe install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x + continue-on-error: true + - name: list files + shell: bash run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + ls -l + ls -R ${{ env.ORT_PACKAGE_NAME }} + continue-on-error: true - - name: Rename OnnxRuntime to ort +# TODO: Find out why do we need to to have libonnxruntime.so.$ort_version + - name: Extract OnnxRuntime library and header files run: | - mv ${{ env.ort_dir }} ort + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/linux-x64/native/* ort/lib/ + ort_version=$(echo ${{ env.ORT_NIGHTLY_VERSION }} | cut -d- -f1-1) + cp ort/lib/libonnxruntime.so ort/lib/libonnxruntime.so.$ort_version + - name: Get Docker Image run: | @@ -78,7 +110,7 @@ jobs: --volume $GITHUB_WORKSPACE:/ort_genai_src \ -w /ort_genai_src onnxruntimecudabuildx64 \ bash -c " \ - /usr/bin/cmake --build --preset linux_gcc_cuda_release --parallel $( nproc )" + /usr/bin/cmake --build --preset linux_gcc_cuda_release" - name: Get HuggingFace Token run: | diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9cb9cdc46..a35063a98 100644 --- 
a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -4,9 +4,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-osx-arm64-1.17.3" - ort_zip: "onnxruntime-osx-arm64-1.17.3.tgz" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-osx-arm64-1.17.3.tgz" + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: runs-on: macos-latest @@ -16,22 +15,21 @@ jobs: with: submodules: true - - name: Install ninja + - name: Get the Latest OnnxRuntime Nightly Version run: | - brew install ninja - - - name: Download OnnxRuntime + ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + echo "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" >> $GITHUB_ENV + - name: Download OnnxRuntime Nightly run: | - curl -L -o ${{ env.ort_zip }} ${{ env.ort_url }} + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x - - name: Unzip OnnxRuntime + - name: Extract OnnxRuntime library and header files run: | - tar -xzf ${{ env.ort_zip }} - rm ${{ env.ort_zip }} + mkdir -p ort/lib + mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Rename OnnxRuntime to ort - run: | - mv ${{ env.ort_dir }} ort - name: Configure CMake run: | diff --git a/.github/workflows/win-cpu-arm64-build.yml b/.github/workflows/win-cpu-arm64-build.yml index 916af3009..ce3bfcf4b 100644 --- a/.github/workflows/win-cpu-arm64-build.yml +++ b/.github/workflows/win-cpu-arm64-build.yml @@ -53,6 +53,11 @@ jobs: run: | cmake --build --preset windows_arm64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -62,10 +67,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cpu-x64-build.yml b/.github/workflows/win-cpu-x64-build.yml index ca0bb6b5b..855025fb2 100644 --- a/.github/workflows/win-cpu-x64-build.yml +++ b/.github/workflows/win-cpu-x64-build.yml @@ -11,10 +11,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: - ort_dir: "onnxruntime-win-x64-1.17.3" - ort_zip: "$(ort_dir).zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)" binaryDir: 'build/cpu' + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: windows-cpu-x64-build: @@ -33,19 +32,32 @@ jobs: with: vs-version: '17.5' - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + 
with: + dotnet-version: '6.0.x' + + - name : Install jq and nuget run: | - $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-1.17.3.zip" - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip + $ORT_NIGHTLY_VERSION = $(curl -s "${{ env.ORT_NIGHTLY_REST_API }}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly + run: | + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -x -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true - - name: Rename OnnxRuntime to ort + - name: Extract OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Initialize CodeQL uses: github/codeql-action/init@v3 @@ -60,6 +72,11 @@ jobs: run: | cmake --build --preset windows_x64_cpu_release --parallel + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the python wheel and test dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -76,10 +93,7 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Verify Build Artifacts if: always() diff --git a/.github/workflows/win-cuda-x64-build.yml b/.github/workflows/win-cuda-x64-build.yml index a9f602ef8..0f27a7bfc 100644 --- a/.github/workflows/win-cuda-x64-build.yml +++ b/.github/workflows/win-cuda-x64-build.yml @@ -8,14 +8,12 @@ concurrency: env: AZCOPY_AUTO_LOGIN_TYPE: MSI AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4 - ort_dir: "onnxruntime-win-x64-gpu-1.17.3" - ort_zip: "onnxruntime-win-x64-gpu-1.17.3.zip" - ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-x64-gpu-1.17.3.zip" cuda_dir: "${{ github.workspace }}\\cuda_sdk" cuda_version: "11.8" CUDA_PATH: ${{ github.workspace }}\\cuda_sdk\\v11.8 binaryDir: 'build/cuda' - + ORT_NIGHTLY_REST_API: "https://feeds.dev.azure.com/aiinfra/PublicPackages/_apis/packaging/Feeds/ORT-Nightly/packages?packageNameQuery=Microsoft.ML.OnnxRuntime.Gpu.Windows&api-version=6.0-preview.1" + ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime.Gpu.Windows" jobs: windows-cuda-x64-build: @@ -35,17 +33,32 @@ jobs: run: | azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ env.cuda_version }}" ${{ env.cuda_dir}} - - name: Download OnnxRuntime + - uses: actions/setup-dotnet@v4 + with: + dotnet-version: '6.0.x' + + - name : Install jq and curl run: | - Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip + choco install -y jq curl - - name: Unzip OnnxRuntime + - name: Get the Latest OnnxRuntime Nightly Version + shell: pwsh + run: | + $ORT_NIGHTLY_VERSION=$(curl -s "${{ env.ORT_NIGHTLY_REST_API 
}}" | jq -r '.value[0].versions[0].normalizedVersion') + echo "$ORT_NIGHTLY_VERSION" + "ORT_NIGHTLY_VERSION=$ORT_NIGHTLY_VERSION" | Out-File -FilePath $env:GITHUB_ENV -Append + - name: Download OnnxRuntime Nightly run: | - Expand-Archive $env:ort_zip -DestinationPath . - Remove-Item -Path $env:ort_zip - - name: Rename OnnxRuntime to ort + nuget install ${{ env.ORT_PACKAGE_NAME }} -version ${{ env.ORT_NIGHTLY_VERSION }} -ExcludeVersion -NonInteractive + + - run: Get-ChildItem ${{ env.ORT_PACKAGE_NAME }} -Recurse + continue-on-error: true + + - name: Extract OnnxRuntime library and header files run: | - Rename-Item -Path $env:ort_dir -NewName ort + mkdir ort/lib + move ${{ env.ORT_PACKAGE_NAME }}/buildTransitive/native/include ort/ + move ${{ env.ORT_PACKAGE_NAME }}/runtimes/win-x64/native/* ort/lib/ - name: Configure CMake run: | @@ -59,6 +72,11 @@ jobs: run: | echo "${{ env.cuda_dir }}\\v${{ env.cuda_version }}\\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Build the C# API and Run the C# Tests + run: | + cd test\csharp + dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" + - name: Install the Python Wheel and Test Dependencies run: | python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) @@ -75,10 +93,6 @@ jobs: run: | python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" - - name: Build the C# API and Run the C# Tests - run: | - cd test\csharp - dotnet test /p:Configuration=release /p:NativeBuildOutputDir="$env:GITHUB_WORKSPACE\$env:binaryDir\Release" - name: Verify Build Artifacts if: always() diff --git a/.gitignore b/.gitignore index 1ff9a0f9c..a4ad8b1a4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,8 @@ cache_dir example-models *.onnx -*.onnx.data +*.onnx.date +**/.DS_Store __pycache__ benchmark/python/*.csv examples/python/genai_models diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index f02e05080..8e09c3eb1 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -27,12 +27,17 @@ parameters: - name: ort_version displayName: 'OnnxRuntime version' type: string - default: '1.17.3' + default: '1.18.0-dev-20240426-1256-b842effa29' + +- name: ort_cuda_version + displayName: 'OnnxRuntime GPU version' + type: string + default: '1.18.0-dev-20240426-0614-b842effa29' - name: ort_dml_version - displayName: 'OnnxRuntime DirectML version' + displayName: 'OnnxRuntime DML version' type: string - default: '1.18.0-dev-20240423-0527-c07b8d545d' + default: '1.18.0-dev-20240426-0116-b842effa29' - name: cuda_version displayName: 'CUDA version' @@ -42,16 +47,6 @@ parameters: - '12.2' default: '11.8' -- name: use_build_in_ort - displayName: 'Whether to use the built-in OnnxRuntime package.' 
- type: boolean - default: false - -- name: publish_to_ado_feed - displayName: 'Publish to Azure DevOps Feed' - type: boolean - default: false - resources: repositories: - repository: manylinux @@ -70,7 +65,6 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} \ No newline at end of file diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index 8759a6cca..b603fcde3 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -7,7 +7,7 @@ parameters: - name: enable_win_cuda displayName: 'Whether Windows CUDA package is built.' type: boolean - default : true + default: true - name: enable_win_dml displayName: 'Whether Windows DirectML package is built.' @@ -27,12 +27,17 @@ parameters: - name: ort_version displayName: 'OnnxRuntime version' type: string - default: '1.17.3' + default: '1.18.0-dev-20240426-1256-b842effa29' + +- name: ort_cuda_version + displayName: 'OnnxRuntime GPU version' + type: string + default: '1.18.0-dev-20240426-0614-b842effa29' - name: ort_dml_version - displayName: 'OnnxRuntime DirectML version' + displayName: 'OnnxRuntime DML version' type: string - default: '1.18.0-dev-20240423-0527-c07b8d545d' + default: '1.18.0-dev-20240426-0116-b842effa29' - name: cuda_version displayName: 'CUDA version' @@ -42,16 +47,6 @@ parameters: - '11.8' - '12.2' -- name: use_build_in_ort - displayName: 'Whether to use the built-in OnnxRuntime package.' - type: boolean - default: false - -- name: publish_to_ado_feed - displayName: 'Whether to publish the packages to ADO feed.' 
- type: boolean - default: false - resources: repositories: - repository: manylinux @@ -70,7 +65,7 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} ort_version: ${{ parameters.ort_version }} + ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} cuda_version: ${{ parameters.cuda_version }} - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index 0728720c4..5f2a4c8e9 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -13,10 +13,6 @@ parameters: values: - 'linux' - 'win' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean jobs: - job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging @@ -46,18 +42,21 @@ jobs: value: ${{ parameters.ort_version }} - name: GDN_CODESIGN_TARGETDIRECTORY value: '$(Build.ArtifactStagingDirectory)/nuget' + - name: os + value: ${{ parameters.os }} - name: ort_filename ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' ${{ elseif eq(parameters.ep, 'cuda') }}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{if eq(parameters.os, 'win') }}: + value: 'Microsoft.ML.OnnxRuntime.Gpu.Windows' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' ${{ elseif eq(parameters.ep, 'directml')}}: - value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.DirectML' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' + - name: genai_nuget_ext ${{ if eq(parameters.ep, 'cpu') }}: value: '' @@ -86,7 +85,8 @@ jobs: - template: steps/capi-linux-step.yml parameters: target: 'onnxruntime-genai' - use_build_in_ort: ${{ parameters.use_build_in_ort }} + arch: ${{ parameters.arch }} + ep: ${{ parameters.ep }} # TODO: Add a step to build the linux nuget package @@ -94,12 +94,9 @@ jobs: - template: steps/capi-win-step.yml parameters: target: 'onnxruntime-genai' + arch: ${{ parameters.arch }} ep: ${{ parameters.ep }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} - - template: steps/nuget-win-step.yml - - ${{ if eq(parameters.publish_to_ado_feed, true)}}: - - template: steps/nuget-ado-feed-releasing-step.yml - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml index 32cc3e0e3..f5a183289 100644 --- a/.pipelines/stages/jobs/py-packaging-job.yml +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -13,10 +13,7 @@ parameters: values: - 'linux' - 'win' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean + jobs: - job: python_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_packaging @@ -69,18 +66,21 @@ jobs: value: ${{ parameters.ep }} - name: ort_version value: ${{ parameters.ort_version }} + - name: os + value: ${{ parameters.os }} + - name: 
ort_filename ${{ if eq(parameters.ep, 'cpu') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' ${{ elseif eq(parameters.ep, 'cuda') }}: - ${{if eq(parameters.cuda_version, '11.8') }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-gpu-${{ parameters.ort_version }}' + ${{if eq(parameters.os, 'win') }}: + value: 'Microsoft.ML.OnnxRuntime.Gpu.Windows' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-cuda12-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.Gpu.Linux' ${{ elseif eq(parameters.ep, 'directml')}}: - value: 'Microsoft.ML.OnnxRuntime.DirectML.${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime.DirectML' ${{ else }}: - value: 'onnxruntime-${{ parameters.os }}-${{ parameters.arch }}-${{ parameters.ep}}-${{ parameters.ort_version }}' + value: 'Microsoft.ML.OnnxRuntime' - name: dml_dir value: 'Microsoft.AI.DirectML.1.14.1' @@ -109,20 +109,16 @@ jobs: - template: steps/capi-linux-step.yml parameters: target: 'python' - use_build_in_ort: ${{ parameters.use_build_in_ort }} - + arch: ${{ parameters.arch }} + ep: ${{ parameters.ep }} # Windows job needs to set the python version and install the required packages - ${{ if eq(parameters.os, 'win') }}: - template: steps/capi-win-step.yml parameters: target: 'python' + arch: ${{ parameters.arch }} ep: ${{ parameters.ep }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} - - - - ${{ if eq(parameters.publish_to_ado_feed, true)}}: - - template: steps/py-ado-feed-releasing-step.yml - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index 1e518f08b..16638017d 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -1,8 +1,12 @@ parameters: - name: target type: string -- name: use_build_in_ort - type: boolean +- name: ep + type: string + default: 'cpu' +- name: arch + type: string + default: 'x64' steps: - checkout: self @@ -29,23 +33,9 @@ steps: echo "ep=$(ep)" displayName: 'Print Parameters' -- ${{ if eq(parameters.use_build_in_ort, false) }}: - - template: utils/download-ort.yml - parameters: - archiveType: 'tgz' -- ${{ else }}: - - bash: | - set -e -x - azcopy copy --recursive "https://lotusscus.blob.core.windows.net/tmp/ort/$(os)/$(ep)" '$(Build.Repository.LocalPath)/tmp_ort' - displayName: 'Download ONNXRuntime' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/lib' - TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/include' - TargetFolder: '$(Build.Repository.LocalPath)/ort/include' +- template: utils/download-ort.yml + parameters: + archiveType: 'tgz' - bash: | set -e -x @@ -57,7 +47,7 @@ steps: --container-registry onnxruntimebuildcache \ --manylinux-src manylinux \ --multiple_repos \ - --repository onnxruntime$(ep)build$(arch) + --repository ortgenai$(ep)build$(arch) displayName: 'Get Docker Image' workingDirectory: '$(Build.Repository.LocalPath)' @@ -67,7 +57,7 @@ steps: docker run \ --rm \ --volume $(Build.Repository.LocalPath):/ort_genai_src \ - -w /ort_genai_src/ onnxruntime$(ep)build$(arch) \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ bash -c " \ /usr/bin/cmake --preset linux_gcc_$(ep)_release \ -DENABLE_TESTS=OFF && \ @@ -75,6 +65,7 @@ steps: --target 
onnxruntime-genai" displayName: 'Build GenAi' workingDirectory: '$(Build.Repository.LocalPath)' + - task: BinSkim@4 displayName: 'Run BinSkim' inputs: @@ -83,12 +74,22 @@ steps: - template: utils/capi-archive.yml parameters: archiveType: tar + - script: | + set -e -x + docker run \ + --rm \ + --volume $(Build.Repository.LocalPath):/ort_genai_src \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ + bash -c " \ + /usr/bin/cmake --build --preset linux_gcc_$(ep)_release --target package" + displayName: 'Package C/C++ API' + workingDirectory: '$(Build.Repository.LocalPath)' - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: ArtifactName: $(artifactName)-capi - PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + PathtoPublish: '$(Build.Repository.LocalPath)/build/$(ep)/package' - ${{ if eq(parameters.target, 'python') }}: - bash: | @@ -96,7 +97,7 @@ steps: docker run \ --rm \ --volume $(Build.Repository.LocalPath):/ort_genai_src \ - -w /ort_genai_src/ onnxruntime$(ep)build$(arch) \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ bash -c " \ /usr/bin/cmake --preset linux_gcc_$(ep)_release \ -DENABLE_TESTS=OFF \ @@ -118,7 +119,7 @@ steps: docker run \ --rm \ --volume $(Build.Repository.LocalPath):/ort_genai_src \ - -w /ort_genai_src/ onnxruntime$(ep)build$(arch) \ + -w /ort_genai_src/ ortgenai$(ep)build$(arch) \ bash -c " \ /usr/bin/cmake --build --preset linux_gcc_$(ep)_release \ -DENABLE_TESTS=OFF \ diff --git a/.pipelines/stages/jobs/steps/capi-win-step.yml b/.pipelines/stages/jobs/steps/capi-win-step.yml index 7026e058e..4d527cda1 100644 --- a/.pipelines/stages/jobs/steps/capi-win-step.yml +++ b/.pipelines/stages/jobs/steps/capi-win-step.yml @@ -2,11 +2,12 @@ parameters: - name: target type: string default: 'onnxruntime-genai' -- name: use_build_in_ort - type: boolean - name: ep type: string default: 'cpu' +- name: arch + type: string + default: 'x64' steps: - bash: | echo "##[error]Error: ep and arch are not set" @@ -33,23 +34,12 @@ steps: echo "cuda_version=$(cuda_version)" echo "target=${{ parameters.target }}" displayName: 'Print Parameters' -- ${{ if eq(parameters.use_build_in_ort, false) }}: - - template: utils/download-ort.yml - parameters: - archiveType: 'zip' - ep: ${{ parameters.ep }} -- ${{ else }}: - - bash: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/tmp/ort/$(os)/$(ep)" '$(Build.Repository.LocalPath)/tmp_ort' - displayName: 'Download ONNXRuntime' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/lib' - TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/tmp_ort/**/include' - TargetFolder: '$(Build.Repository.LocalPath)/ort/include' + + +- template: utils/download-ort.yml + parameters: + archiveType: 'zip' + ep: ${{ parameters.ep }} - ${{ if eq(parameters.ep, 'directml') }}: - powershell: | @@ -98,15 +88,16 @@ steps: AnalyzeTargetGlob: '$(Build.Repository.LocalPath)\**\*genai.dll' continueOnError: true - - template: utils/capi-archive.yml - parameters: - archiveType: zip + - powershell: | + cmake --build --preset windows_$(arch)_$(ep)_release --target package + displayName: 'Package C/C++ API' + workingDirectory: '$(Build.Repository.LocalPath)' - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact: ONNXRuntime Genai capi' inputs: ArtifactName: $(artifactName)-capi - PathtoPublish: '$(Build.ArtifactStagingDirectory)/capi' + PathtoPublish: 
'$(Build.Repository.LocalPath)\build\$(ep)\package' - ${{ if eq(parameters.target, 'python') }}: - task: BinSkim@4 diff --git a/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml b/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml deleted file mode 100644 index 331a9ea7c..000000000 --- a/.pipelines/stages/jobs/steps/nuget-ado-feed-releasing-step.yml +++ /dev/null @@ -1,43 +0,0 @@ -steps: -- task: NuGetToolInstaller@1 - inputs: - versionSpec: 6.8.x - -- powershell: | - New-Item -Path $(Agent.TempDirectory) -Name "binfiles" -ItemType "directory" - $base_path_name = Join-Path -Path $(Agent.TempDirectory) -ChildPath "binfiles" - Get-ChildItem $(GDN_CODESIGN_TARGETDIRECTORY) -Filter *.nupkg | - Foreach-Object { - $dir_name = Join-Path -Path $base_path_name -ChildPath $_.Basename - $cmd = "7z.exe x $($_.FullName) -y -o$dir_name" - Write-Output $cmd - Invoke-Expression -Command $cmd - } - dir $(Agent.TempDirectory) - tree $(Agent.TempDirectory) - workingDirectory: '$(Agent.TempDirectory)' - -- task: CodeSign@1 - displayName: 'Run Codesign Validation' - -- task: PublishSecurityAnalysisLogs@3 - displayName: 'Publish Security Analysis Logs' - continueOnError: true - -- task: PostAnalysis@2 - inputs: - GdnBreakAllTools: true - GdnBreakPolicy: M365 - GdnBreakPolicyMinSev: Error - -- template: utils/get-nuget-package-version-as-variable.yml - parameters: - packageFolder: '$(GDN_CODESIGN_TARGETDIRECTORY)' -#This task must be run on a Windows machine -- task: NuGetCommand@2 - displayName: 'NuGet push to Azure DevOps Feed' - inputs: - command: push - packagesToPush: '$(GDN_CODESIGN_TARGETDIRECTORY)/*.nupkg' - publishVstsFeed: 'PublicPackages/onnxruntime-genai' - allowPackageConflicts: true \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/nuget-win-step.yml b/.pipelines/stages/jobs/steps/nuget-win-step.yml index af502e0df..aae44d69a 100644 --- a/.pipelines/stages/jobs/steps/nuget-win-step.yml +++ b/.pipelines/stages/jobs/steps/nuget-win-step.yml @@ -16,7 +16,7 @@ steps: DisplayName: 'ESRP - Sign C# dlls' Pattern: '*OnnxRuntimeGenAI*.dll' - powershell: | - $VERSION = '0.2.0-rc4' + $VERSION = '0.2.0-rc5' nuget.exe pack Microsoft.ML.OnnxRuntimeGenAI.nuspec ` -Prop version=$VERSION ` -Prop genai_nuget_ext=$(genai_nuget_ext) ` diff --git a/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml b/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml deleted file mode 100644 index 85c0a7e3d..000000000 --- a/.pipelines/stages/jobs/steps/py-ado-feed-releasing-step.yml +++ /dev/null @@ -1,10 +0,0 @@ -steps: -- task: TwineAuthenticate@1 - inputs: - artifactFeed: PublicPackages/onnxruntime-genai -- script: 'python -m twine upload -r onnxruntime-genai --config-file $(PYPIRC_PATH) --non-interactive *.whl' - workingDirectory: '$(Build.ArtifactStagingDirectory)/wheel' - displayName: 'Uploading wheels to PublicPackages/onnxruntime-genai' - retryCountOnTaskFailure: 3 - env: - SYSTEM_ACCESSTOKEN: $(System.AccessToken) \ No newline at end of file diff --git a/.pipelines/stages/jobs/steps/utils/capi-archive.yml b/.pipelines/stages/jobs/steps/utils/capi-archive.yml index 1395b31f7..3de2f2703 100644 --- a/.pipelines/stages/jobs/steps/utils/capi-archive.yml +++ b/.pipelines/stages/jobs/steps/utils/capi-archive.yml @@ -20,7 +20,7 @@ steps: inputs: SourceFolder: '$(Build.Repository.LocalPath)/$(buildDir)' Contents: | - onnxruntime-genai.so + libonnxruntime-genai.so TargetFolder: '$(Build.ArtifactStagingDirectory)\$(artifactName)\lib' - ${{ else }}: - task: CopyFiles@2 diff 
--git a/.pipelines/stages/jobs/steps/utils/download-ort.yml b/.pipelines/stages/jobs/steps/utils/download-ort.yml index 5346bade8..fd393c62f 100644 --- a/.pipelines/stages/jobs/steps/utils/download-ort.yml +++ b/.pipelines/stages/jobs/steps/utils/download-ort.yml @@ -5,60 +5,58 @@ parameters: type: string default: cpu steps: -- bash: | - echo "##[error]Error: ort_version and ort_filename are not set" - exit 1 - displayName: 'Check if variables ort_version and ort_filename are set' - condition: or( eq (variables['ort_version'], ''), eq (variables['ort_filename'], '')) -#Special case for DML -- ${{ if ne(parameters.ep, 'directml') }}: - - task: DownloadGitHubRelease@0 - inputs: - connection: 'GitHub - Release' - userRepository: 'microsoft/onnxruntime' - defaultVersionType: 'specificTag' - version: 'v$(ort_version)' - itemPattern: '$(ort_filename).${{ parameters.archiveType }}' - downloadPath: '$(Build.Repository.LocalPath)' - displayName: Download $(ort_filename) +- task: DownloadPackage@1 + inputs: + packageType: 'nuget' + feed: '2692857e-05ef-43b4-ba9c-ccf1c22c437c/7982ae20-ed19-4a35-a362-a96ac99897b7' + definition: '$(ort_filename)' # Can also be package name + version: '$(ort_version)' + extract: false + downloadPath: '$(Build.Repository.LocalPath)' + displayName: Download Onnxruntime file +- ${{ if eq(parameters.archiveType, 'zip') }}: - task: ExtractFiles@1 inputs: - archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).${{ parameters.archiveType }}' - destinationFolder: '$(Build.Repository.LocalPath)' + archiveFilePatterns: '$(Build.Repository.LocalPath)/*.nupkg' + destinationFolder: '$(Build.Repository.LocalPath)/ort' cleanDestinationFolder: false overwriteExistingFiles: true displayName: Unzip OnnxRuntime - task: CopyFiles@2 inputs: - SourceFolder: '$(Build.Repository.LocalPath)/$(ort_filename)' - TargetFolder: '$(Build.Repository.LocalPath)/ort' - displayName: Copy OnnxRuntime to ort + SourceFolder: '$(Build.Repository.LocalPath)/ort/runtimes/$(os)-$(arch)/native' + TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - ${{ else }}: - - task: DownloadPackage@1 - inputs: - packageType: 'nuget' - feed: '2692857e-05ef-43b4-ba9c-ccf1c22c437c/7982ae20-ed19-4a35-a362-a96ac99897b7' - definition: 'Microsoft.ML.OnnxRuntime.DirectML' # Can also be package name - version: '$(ort_version)' - extract: false - downloadPath: '$(Build.Repository.LocalPath)' - displayName: Download $(ort_filename) + - script: | + mv $(Build.Repository.LocalPath)/$(ort_filename).nupkg $(Build.Repository.LocalPath)/$(ort_filename).zip + displayName: Rename OnnxRuntime nuget package to zip - task: ExtractFiles@1 inputs: - archiveFilePatterns: '$(Build.Repository.LocalPath)/*.nupkg' + archiveFilePatterns: '$(Build.Repository.LocalPath)/$(ort_filename).zip' destinationFolder: '$(Build.Repository.LocalPath)/ort' cleanDestinationFolder: false overwriteExistingFiles: true displayName: Unzip OnnxRuntime - task: CopyFiles@2 inputs: - SourceFolder: '$(Build.Repository.LocalPath)/ort/runtimes/win-x64/native' + SourceFolder: '$(Build.Repository.LocalPath)/ort/runtimes/$(os)-$(arch)/native' TargetFolder: '$(Build.Repository.LocalPath)/ort/lib' - - task: CopyFiles@2 - inputs: - SourceFolder: '$(Build.Repository.LocalPath)/ort/build/native/include' - TargetFolder: '$(Build.Repository.LocalPath)/ort/include' + # TODO: Find out why do we need to to have libonnxruntime.so.ort_stable_version + - script: | + set -e -x + ort_stable_version=$(echo $(ort_version) | cut -d- -f1) + echo $ort_stable_version + cp 
libonnxruntime.so libonnxruntime.so.$ort_stable_version + displayName: Copy libonnxruntime.so to libonnxruntime.so. + workingDirectory: '$(Build.Repository.LocalPath)/ort/lib' + +- task: CopyFiles@2 + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/ort/' + Contents: '**/native/include/**' + TargetFolder: '$(Build.Repository.LocalPath)/ort/include' + flattenFolders: true - task: DeleteFiles@1 inputs: diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index 8bc0da10a..37709022e 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -11,15 +11,14 @@ parameters: type: boolean - name: ort_version type: string +- name: ort_cuda_version + type: string - name: ort_dml_version type: string - name: cuda_version type: string default: '' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean + stages: - stage: nuget_packaging @@ -31,8 +30,7 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_dml, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -40,18 +38,16 @@ stages: ep: 'directml' ort_version: ${{ parameters.ort_dml_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/nuget-packaging-job.yml parameters: @@ -59,15 +55,12 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/nuget-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index 39a0d8e77..d2e220d5a 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -11,15 +11,14 @@ parameters: type: boolean - name: ort_version type: string +- name: ort_cuda_version + type: string - name: ort_dml_version type: string - name: cuda_version type: string default: '' -- name: use_build_in_ort - type: boolean -- name: publish_to_ado_feed - type: boolean + stages: - stage: python_packaging @@ -31,8 +30,7 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_dml, true) }}: - template: jobs/py-packaging-job.yml parameters: @@ -40,18 +38,16 @@ stages: ep: 'directml' ort_version: ${{ 
parameters.ort_dml_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_win_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'win' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cpu, true) }}: - template: jobs/py-packaging-job.yml @@ -60,15 +56,17 @@ stages: ep: 'cpu' ort_version: ${{ parameters.ort_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} + - ${{ if eq(parameters.enable_linux_cuda, true) }}: - template: jobs/py-packaging-job.yml parameters: arch: 'x64' cuda_version: ${{ parameters.cuda_version }} ep: 'cuda' - ort_version: ${{ parameters.ort_version }} + ort_version: ${{ parameters.ort_cuda_version }} os: 'linux' - publish_to_ado_feed: ${{ parameters.publish_to_ado_feed }} - use_build_in_ort: ${{ parameters.use_build_in_ort }} \ No newline at end of file + + + + + diff --git a/CMakeLists.txt b/CMakeLists.txt index acf8f22f6..63c438245 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ message("Building onnxruntime-genai for version ${VERSION_INFO}") # Checking if CUDA is supported include(CheckLanguage) add_compile_definitions(BUILDING_ORT_GENAI_C) + if(USE_CUDA) check_language(CUDA) if(CMAKE_CUDA_COMPILER) @@ -150,8 +151,8 @@ if(USE_DML) target_include_directories(onnxruntime-genai-static PUBLIC $) target_include_directories(onnxruntime-genai-static PUBLIC $/directx) target_include_directories(onnxruntime-genai-static PUBLIC $) - target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib) - target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib) + target_link_libraries(onnxruntime-genai PRIVATE d3d12.lib dxcore.lib dxguid.lib dxgi.lib) + target_link_libraries(onnxruntime-genai-static PUBLIC d3d12.lib dxcore.lib dxguid.lib dxgi.lib) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/_deps ABSOLUTE) set(DXC_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.Direct3D.DXC.1.7.2308.12) @@ -245,5 +246,7 @@ foreach(DLL_FILE ${onnxruntime_libs}) ) endforeach() +include(cmake/package.cmake) + # Have visual studio put all files into one single folder vs the default split of header files into a separate folder source_group(TREE ${GENERATORS_ROOT} FILES ${generator_srcs}) diff --git a/VERSION_INFO b/VERSION_INFO index ff4a316ec..47035a2ca 100644 --- a/VERSION_INFO +++ b/VERSION_INFO @@ -1 +1 @@ -0.2.0rc4 \ No newline at end of file +0.2.0rc5 \ No newline at end of file diff --git a/benchmark/c/main.cpp b/benchmark/c/main.cpp index 76d25458b..4e502de00 100644 --- a/benchmark/c/main.cpp +++ b/benchmark/c/main.cpp @@ -111,7 +111,7 @@ void WriteE2EStats(std::string_view label, << "\n"; } -std::string GeneratePrompt(size_t num_prompt_tokens, OgaModel& model, const OgaTokenizer& tokenizer) { +std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer) { const char* const base_prompt = "A"; auto base_prompt_sequences = OgaSequences::Create(); @@ -231,6 +231,7 @@ void RunBenchmark(const benchmark::Options& opts) { } // namespace int main(int argc, char** argv) { + OgaHandle handle; try { const auto opts = 
benchmark::ParseOptionsFromCommandLine(argc, argv); RunBenchmark(opts); diff --git a/cmake/cxx_standard.cmake b/cmake/cxx_standard.cmake index 7e752d40b..52732c2f2 100644 --- a/cmake/cxx_standard.cmake +++ b/cmake/cxx_standard.cmake @@ -9,4 +9,8 @@ elseif (USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION else () message("Test is using C++20") set(CMAKE_CXX_STANDARD 20) -endif () \ No newline at end of file +endif () + +if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) + add_compile_definitions(USE_EXPERIMENTAL_FILESYSTEM) +endif() \ No newline at end of file diff --git a/cmake/package.cmake b/cmake/package.cmake new file mode 100644 index 000000000..9c9102689 --- /dev/null +++ b/cmake/package.cmake @@ -0,0 +1,29 @@ +set_target_properties( + onnxruntime-genai PROPERTIES PUBLIC_HEADER + "${PROJECT_SOURCE_DIR}/src/ort_genai_c.h;${PROJECT_SOURCE_DIR}/src/ort_genai.h" +) +install(TARGETS + onnxruntime-genai + LIBRARY + PUBLIC_HEADER +) +set(CPACK_PACKAGE_VENDOR "Microsoft") +set(CPACK_PACKAGE_NAME "onnxruntime-genai") +set(CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/LICENSE") +set(CPACK_RESOURCE_FILE_README "${PROJECT_SOURCE_DIR}/README.md") +set(CPACK_PACKAGE_HOMEPAGE_URL "https://github.com/microsoft/onnxruntime-genai") +set(CPACK_OUTPUT_FILE_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/package") +set(CPACK_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${VERSION_INFO}-${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") +if (WIN32) + set(CPACK_GENERATOR "ZIP") +else () + set(CPACK_GENERATOR "TGZ") +endif () +set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY TRUE) +install(FILES + "${PROJECT_SOURCE_DIR}/README.md" + "${PROJECT_SOURCE_DIR}/ThirdPartyNotices.txt" + "${PROJECT_SOURCE_DIR}/SECURITY.md" + "${PROJECT_SOURCE_DIR}/LICENSE" + DESTINATION .) 
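Note on the new cmake/package.cmake: the "Package C/C++ API" steps added to capi-linux-step.yml and capi-win-step.yml rely on this CPack configuration. A rough sketch of the expected invocation and result, assuming only the variables defined in this file (Linux CPU preset shown; the Windows generator produces a ZIP instead of a TGZ):

  # Build the CPack-driven "package" target for the chosen preset.
  cmake --build --preset linux_gcc_cpu_release --target package
  # The archive lands under CPACK_OUTPUT_FILE_PREFIX, i.e. <binaryDir>/package
  # (build/cpu/package for the CPU builds), and is named
  # onnxruntime-genai-<VERSION_INFO>-<system>-<processor>.tar.gz.
  ls build/cpu/package/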
+include(CPack) \ No newline at end of file diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index cd0c0a0b9..1ea6d85c8 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -6,7 +6,7 @@ "configurePresets": [ { "name": "macos_default", - "generator": "Ninja", + "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", diff --git a/examples/c/src/main.cpp b/examples/c/src/main.cpp index e4be639f2..f78aa196e 100644 --- a/examples/c/src/main.cpp +++ b/examples/c/src/main.cpp @@ -9,7 +9,8 @@ void CXX_API(const char* model_path) { auto tokenizer = OgaTokenizer::Create(*model); const char* prompt = "def is_prime(num):"; - std::cout << "Prompt: " << std::endl << prompt << std::endl; + std::cout << "Prompt: " << std::endl + << prompt << std::endl; auto sequences = OgaSequences::Create(); tokenizer->Encode(prompt, *sequences); @@ -21,14 +22,15 @@ void CXX_API(const char* model_path) { auto output_sequences = model->Generate(*params); auto out_string = tokenizer->Decode(output_sequences->Get(0)); - std::cout << "Output: " << std::endl << out_string << std::endl; + std::cout << "Output: " << std::endl + << out_string << std::endl; } // C API Example void CheckResult(OgaResult* result) { if (result) { - std::string string=OgaResultGetError(result); + std::string string = OgaResultGetError(result); OgaDestroyResult(result); throw std::runtime_error(string); } @@ -84,6 +86,8 @@ int main(int argc, char** argv) { return -1; } + // Responsible for cleaning up the library during shutdown + OgaHandle handle; std::cout << "-------------" << std::endl; std::cout << "Hello, Phi-2!" << std::endl; diff --git a/examples/csharp/HelloPhi2/Program.cs b/examples/csharp/HelloPhi2/Program.cs index 993af8b57..fecb24ad7 100644 --- a/examples/csharp/HelloPhi2/Program.cs +++ b/examples/csharp/HelloPhi2/Program.cs @@ -1,6 +1,8 @@ // See https://aka.ms/new-console-template for more information using Microsoft.ML.OnnxRuntimeGenAI; +OgaHandle ogaHandle = new OgaHandle(); + Console.WriteLine("-------------"); Console.WriteLine("Hello, Phi-2!"); Console.WriteLine("-------------"); diff --git a/nuget.config b/nuget.config index 3e0389a52..f3853aee6 100644 --- a/nuget.config +++ b/nuget.config @@ -3,11 +3,14 @@ - - - - - - - + + + + + + + + + + \ No newline at end of file diff --git a/onnxruntime-genai.sln b/onnxruntime-genai.sln new file mode 100644 index 000000000..5e59cc82e --- /dev/null +++ b/onnxruntime-genai.sln @@ -0,0 +1,36 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{2253BDCC-33C9-431E-889A-56E3E75D10BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI", "src\csharp\Microsoft.ML.OnnxRuntimeGenAI.csproj", "{CA0EC087-3AF5-44D5-93F0-489420EBA014}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{505E2406-98C2-46DD-973A-3CEB95CF3626}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.OnnxRuntimeGenAI.Tests", "test\csharp\Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj", "{24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any 
CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Debug|Any CPU.Build.0 = Debug|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.ActiveCfg = Release|Any CPU + {CA0EC087-3AF5-44D5-93F0-489420EBA014}.Release|Any CPU.Build.0 = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Debug|Any CPU.Build.0 = Debug|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.ActiveCfg = Release|Any CPU + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {CA0EC087-3AF5-44D5-93F0-489420EBA014} = {2253BDCC-33C9-431E-889A-56E3E75D10BA} + {24C405DE-9E56-4E1F-BDDF-332DFD8BCB73} = {505E2406-98C2-46DD-973A-3CEB95CF3626} + EndGlobalSection +EndGlobal diff --git a/src/config.cpp b/src/config.cpp index 39341f5b5..7dc3133ec 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -397,7 +397,7 @@ struct RootObject_Element : JSON::Element { JSON::Element& t_; }; -void ParseConfig(const std::filesystem::path& filename, Config& config) { +void ParseConfig(const fs::path& filename, Config& config) { std::ifstream file(filename, std::ios::binary | std::ios::ate); if (!file.is_open()) { throw std::runtime_error("Error opening " + filename.string()); @@ -421,7 +421,7 @@ void ParseConfig(const std::filesystem::path& filename, Config& config) { } } -Config::Config(const std::filesystem::path& path) : config_path{path} { +Config::Config(const fs::path& path) : config_path{path} { ParseConfig(path / "genai_config.json", *this); if (model.context_length == 0) diff --git a/src/config.h b/src/config.h index b94e05ca0..8fb5debdc 100644 --- a/src/config.h +++ b/src/config.h @@ -6,9 +6,9 @@ namespace Generators { struct Config { Config() = default; - Config(const std::filesystem::path& path); + Config(const fs::path& path); - std::filesystem::path config_path; // Path of the config directory + fs::path config_path; // Path of the config directory using ProviderOption = std::pair; struct ProviderOptions { diff --git a/src/csharp/NativeMethods.cs b/src/csharp/NativeMethods.cs index f2906f3df..8960bdb0e 100644 --- a/src/csharp/NativeMethods.cs +++ b/src/csharp/NativeMethods.cs @@ -71,7 +71,7 @@ internal class NativeLib IntPtr /* const OgaSequences* */ sequences); [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaCreateGenerator(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaGenerator** */ generator); @@ -129,7 +129,7 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq // This function is used to generate sequences for the given model using the given generator parameters. // The OgaSequences object is an array of sequences, where each sequence is an array of tokens. 
[DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] - public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* OgaModel* */ model, + public static extern IntPtr /* OgaResult* */ OgaGenerate(IntPtr /* const OgaModel* */ model, IntPtr /* const OgaGeneratorParams* */ generatorParams, out IntPtr /* OgaSequences** */ sequences); @@ -176,5 +176,8 @@ public static extern UIntPtr OgaSequencesGetSequenceCount(IntPtr /* const OgaSeq [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] public static extern IntPtr /* OgaResult* */ OgaGetCurrentGpuDeviceId(out IntPtr /* int32_t */ device_id); + + [DllImport(NativeLib.DllName, CallingConvention = CallingConvention.Winapi)] + public static extern void OgaShutdown(); } } diff --git a/src/csharp/Utils.cs b/src/csharp/Utils.cs index 815652a71..2a8723280 100644 --- a/src/csharp/Utils.cs +++ b/src/csharp/Utils.cs @@ -7,6 +7,14 @@ namespace Microsoft.ML.OnnxRuntimeGenAI { + public class OgaHandle + { + ~OgaHandle() + { + NativeMethods.OgaShutdown(); + } + } + public class Utils { public static void SetCurrentGpuDeviceId(int device_id) diff --git a/src/dml/dml_adapter_info.h b/src/dml/dml_adapter_info.h index eb021b8ae..1638649bb 100644 --- a/src/dml/dml_adapter_info.h +++ b/src/dml/dml_adapter_info.h @@ -11,6 +11,7 @@ enum class VendorID { Undefined = 0, Intel = 0x8086, + Microsoft = 0x1414, }; // Retrieves information from a DXCore or DXGI adapter. @@ -27,4 +28,5 @@ class AdapterInfo { void Initialize(IDXCoreAdapter* adapter); ::VendorID vendor_id_; + uint32_t device_id_; }; diff --git a/src/dml/dml_helpers.cpp b/src/dml/dml_helpers.cpp index e7a0c2f08..d7954b84c 100644 --- a/src/dml/dml_helpers.cpp +++ b/src/dml/dml_helpers.cpp @@ -4,11 +4,82 @@ #include #include #include +#include #include "dml_helpers.h" #include "dml_adapter_info.h" namespace DmlHelpers { +static bool IsSoftwareAdapter(IDXGIAdapter1* adapter) { + DXGI_ADAPTER_DESC1 desc = {}; + THROW_IF_FAILED(adapter->GetDesc1(&desc)); + + // See here for documentation on filtering WARP adapter: + // https://docs.microsoft.com/en-us/windows/desktop/direct3ddxgi/d3d10-graphics-programming-guide-dxgi#new-info-about-enumerating-adapters-for-windows-8 + const bool is_basic_render_driver_vendor_id = desc.VendorId == static_cast(VendorID::Microsoft); + const bool is_basic_render_driver_device_id = desc.DeviceId == 0x8c; + return desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE || (is_basic_render_driver_vendor_id && is_basic_render_driver_device_id); +}; + +static std::vector> EnumerateAdapters() { + ComPtr dxgi_factory; + THROW_IF_FAILED(CreateDXGIFactory(IID_PPV_ARGS(&dxgi_factory))); + + std::vector> adapter_infos; + + ComPtr dxgi_factory6; + if (SUCCEEDED(dxgi_factory.As(&dxgi_factory6))) { + // Enumerate adapters by performance. This only works in Windows 10 Version 1803 and later. + ComPtr adapter; + for (uint32_t adapter_index = 0; + dxgi_factory6->EnumAdapterByGpuPreference( + adapter_index, + DXGI_GPU_PREFERENCE_HIGH_PERFORMANCE, + IID_PPV_ARGS(&adapter)) != DXGI_ERROR_NOT_FOUND; + adapter_index++) { + // Since we enumerate by performance, we can ignore everything that comes after the first software adapter, which includes the IDD + // adapters. This is necessary for now because IDD (e.g. remote desktop) adapters don't have the DXGI_ADAPTER_FLAG_SOFTWARE flag, + // even though they run on software. 
+ if (IsSoftwareAdapter(adapter.Get())) { + break; + } + + // Make sure that we are able to create the device + ComPtr d3d12_device; + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&d3d12_device))); + + if (d3d12_device) { + adapter_infos.emplace_back(std::move(adapter)); + } + } + } else { + // Enumerate adapters without ordering. + ComPtr adapter; + for (uint32_t adapter_index = 0; dxgi_factory->EnumAdapters1(adapter_index, &adapter) != DXGI_ERROR_NOT_FOUND; adapter_index++) { + // We can't assume the ordering of hardware and software adapters, so keep looping. This path should only execute on Windows 10 + // version 1709 or earlier; IDD (e.g. remote desktop) adapters do not exist when taking this code path. + if (IsSoftwareAdapter(adapter.Get())) { + continue; + } + + // Make sure that we are able to create the device + ComPtr d3d12_device; + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&d3d12_device))); + + if (d3d12_device) { + adapter_infos.emplace_back(std::move(adapter)); + } + } + } + + return adapter_infos; +} + +static ComPtr CreatePerformantAdapter() { + auto filtered_adapters = EnumerateAdapters(); + return filtered_adapters.front(); +} + DmlObjects CreateDmlObjects() { D3D12_COMMAND_QUEUE_DESC command_queue_description = { D3D12_COMMAND_LIST_TYPE_COMPUTE, @@ -19,7 +90,8 @@ DmlObjects CreateDmlObjects() { DmlObjects dml_objects; - THROW_IF_FAILED(D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); + auto adapter = CreatePerformantAdapter(); + THROW_IF_FAILED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&dml_objects.d3d12_device))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandQueue(&command_queue_description, IID_PPV_ARGS(&dml_objects.command_queue))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&dml_objects.command_allocator))); THROW_IF_FAILED(dml_objects.d3d12_device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, dml_objects.command_allocator.Get(), nullptr, IID_PPV_ARGS(&dml_objects.command_list))); diff --git a/src/filesystem.h b/src/filesystem.h new file mode 100644 index 000000000..45c4c7015 --- /dev/null +++ b/src/filesystem.h @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// TODO(baijumeswani): Remove experimental when packaging pipeline can use GCC > 8 +#ifdef USE_EXPERIMENTAL_FILESYSTEM +#include +namespace fs = std::experimental::filesystem; +#else +#include +namespace fs = std::filesystem; +#endif diff --git a/src/generators.cpp b/src/generators.cpp index 49751c3e9..133d735da 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -11,6 +11,23 @@ namespace Generators { +static bool _ = (Ort::InitApi(), false); + +OrtGlobals::OrtGlobals() : env_{OrtEnv::Create(OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR)} {} + +std::unique_ptr& GetOrtGlobals() { + static auto globals = std::make_unique(); + return globals; +} + +void Shutdown() { + GetOrtGlobals().reset(); +} + +OrtEnv& GetOrtEnv() { + return *GetOrtGlobals()->env_; +} + // IEEE 752-2008 binary16 format, 1 sign bit, 5 bit exponent, 10 bit fraction float Float16ToFloat32(uint16_t v) { // Extract sign, exponent, and fraction from numpy.float16 @@ -44,7 +61,25 @@ GeneratorParams::GeneratorParams(const Model& model) eos_token_id{model.config_->model.eos_token_id}, vocab_size{model.config_->model.vocab_size}, device_type{model.device_type_}, - cuda_stream{model.cuda_stream_} { + cuda_stream{model.cuda_stream_}, + is_cuda_graph_enabled_{IsCudaGraphEnabled(model.config_->model.decoder.session_options)} { +} + +void GeneratorParams::TryGraphCapture(int max_bs) { + if (!is_cuda_graph_enabled_ || device_type == DeviceType::CPU) { + // no-op + return; + } + + if (DeviceType::CUDA == device_type || DeviceType::DML == device_type) { + if (max_bs == 0) { + throw std::runtime_error("Graph capture is enabled, but max_batch_size is not set."); + } + use_cuda_graph = true; + max_batch_size = max_bs; + } else { + throw std::runtime_error("CUDA graph is not supported on this device"); + } } std::unique_ptr CreateGenerator(const Model& model, const GeneratorParams& params) { diff --git a/src/generators.h b/src/generators.h index 3b2e65fa8..beb958353 100644 --- a/src/generators.h +++ b/src/generators.h @@ -5,8 +5,9 @@ #include #include #include -#include +#include "filesystem.h" #include +#include #include "span.h" #include #include @@ -61,6 +62,7 @@ struct GeneratorParams : std::enable_shared_from_this { int batch_size{1}; int max_batch_size{0}; + bool use_cuda_graph{}; int sequence_length{}; int BatchBeamSize() const { return search.num_beams * batch_size; } @@ -97,6 +99,11 @@ struct GeneratorParams : std::enable_shared_from_this { std::vector input_ids_owner; // Backing memory of input_ids in some cases std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime + + void TryGraphCapture(int max_bs); + + private: + bool is_cuda_graph_enabled_{}; }; struct Generator { @@ -114,6 +121,23 @@ struct Generator { bool computed_logits_{}; // Set to true in ComputeLogits() and false after appending a token to ensure a 1 to 1 call ratio }; +struct OrtGlobals { + OrtGlobals(); + + std::unique_ptr env_; +#if USE_CUDA + std::unique_ptr memory_info_cuda_; + std::unique_ptr allocator_cuda_; +#endif + private: + OrtGlobals(const OrtGlobals&) = delete; + void operator=(const OrtGlobals&) = delete; +}; + +std::unique_ptr& GetOrtGlobals(); +void Shutdown(); // Do this once at exit, Ort code will fail after this call +OrtEnv& GetOrtEnv(); + std::shared_ptr CreateModel(OrtEnv& ort_env, const char* config_path); std::shared_ptr CreateGeneratorParams(const Model& model); std::shared_ptr CreateGeneratorParams(); // For benchmarking purposes only diff --git a/src/logging.cpp b/src/logging.cpp index 
6c334f50a..edd698168 100644 --- a/src/logging.cpp +++ b/src/logging.cpp @@ -44,7 +44,7 @@ void SetLogString(std::string_view name, std::string_view value) { if (value.empty()) gp_logfile.reset(); else { - std::filesystem::path filename{value}; + fs::path filename{std::string(value)}; gp_logfile = std::make_unique(filename); } diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp index 140f2a8cd..96cc029b8 100644 --- a/src/models/captured_graph_pool.cpp +++ b/src/models/captured_graph_pool.cpp @@ -24,7 +24,7 @@ static std::tuple MakeKey(int max_batch_size, int max_length, int } CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, const GeneratorParams& params) const { - if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { + if (!params.use_cuda_graph || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) { return nullptr; } diff --git a/src/models/decoder_only.cpp b/src/models/decoder_only.cpp index 83d1f03d3..53f4f6697 100644 --- a/src/models/decoder_only.cpp +++ b/src/models/decoder_only.cpp @@ -26,7 +26,7 @@ DecoderOnly_State::DecoderOnly_State(const DecoderOnly_Model& model, RoamingArra RoamingArray DecoderOnly_State::Run(int current_length, RoamingArray next_tokens, RoamingArray next_indices) { if (first_run_) { - if (model_.use_cuda_graph_) { + if (params_->use_cuda_graph) { model_.run_options_->AddConfigEntry("gpu_graph_id", "-1"); } first_run_ = false; @@ -37,7 +37,7 @@ RoamingArray DecoderOnly_State::Run(int current_length, RoamingArrayuse_cuda_graph) { int new_batch_size = static_cast(input_ids_.GetShape()[0]); if (new_batch_size != current_batch_size_) { current_batch_size_ = new_batch_size; diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 0c4436928..3c2e0dbfa 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -117,7 +117,7 @@ KV_Cache::KV_Cache(const Model& model, State& state) : model_{model}, state_{state}, layer_count_{model_.config_->model.decoder.num_hidden_layers}, - past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1 && (model_.device_type_ == DeviceType::CUDA || model_.device_type_ == DeviceType::DML)}, + past_present_share_buffer_{state_.params_->search.past_present_share_buffer && state_.params_->search.num_beams == 1}, shape_{state_.params_->BatchBeamSize(), model.config_->model.decoder.num_key_value_heads, 0, model.config_->model.decoder.head_size} { if (g_log.enabled && g_log.warning && past_present_share_buffer_ != state_.params_->search.past_present_share_buffer) Log("warning", "past_present_share_buffer search option set to true, but has been disabled due to the current configuration. See https://aka.ms/generate_config for details"); diff --git a/src/models/model.cpp b/src/models/model.cpp index 9226e5c05..a5de9979f 100644 --- a/src/models/model.cpp +++ b/src/models/model.cpp @@ -187,14 +187,12 @@ std::vector Tokenizer::DecodeBatch(std::span sequenc // has been destroyed. Without this, we will crash in the Onnxruntime BFCArena code when deleting tensors due to the // arena already being destroyed. 
Ort::Allocator* GetCudaAllocator(OrtSession& session) { - static std::unique_ptr memory_info_cuda_; - static std::unique_ptr allocator_cuda_; - - if (!allocator_cuda_) { - memory_info_cuda_ = OrtMemoryInfo::Create("Cuda", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); - allocator_cuda_ = Ort::Allocator::Create(session, *memory_info_cuda_); + auto& globals = *GetOrtGlobals(); + if (!globals.allocator_cuda_) { + globals.memory_info_cuda_ = OrtMemoryInfo::Create("Cuda", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + globals.allocator_cuda_ = Ort::Allocator::Create(session, *globals.memory_info_cuda_); } - return allocator_cuda_.get(); + return globals.allocator_cuda_.get(); } #endif @@ -316,7 +314,7 @@ void Model::CreateSessionOptions() { } if (options.enable_profiling.has_value()) { - std::filesystem::path profile_file_prefix{options.enable_profiling.value()}; + fs::path profile_file_prefix{options.enable_profiling.value()}; ort_options.EnableProfiling(profile_file_prefix.c_str()); } @@ -539,26 +537,4 @@ std::unique_ptr Model::ExpandInputs(std::unique_ptr& input, return expanded; } -void Model::GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params) { - bool is_cuda_graph_enabled = device_type_ == DeviceType::DML || IsCudaGraphEnabled(config_->model.decoder.session_options); - max_batch_size_ = params.max_batch_size; - - if (DeviceType::CUDA == device_type_) { - if (is_cuda_graph_enabled) { - if (max_batch_size_ == 0) { - throw std::runtime_error("CUDA graph is enabled, but max_batch_size is not set."); - } - use_cuda_graph_ = true; - } - } else if (DeviceType::DML == device_type_) { - if (max_batch_size_ == 0) { - throw std::runtime_error("max_batch_size needs to be set when using DirectML."); - } - - use_cuda_graph_ = true; - } else if (is_cuda_graph_enabled) { - throw std::runtime_error("CUDA graph is not supported on this device"); - } -} - } // namespace Generators diff --git a/src/models/model.h b/src/models/model.h index b569373f8..d1b0a1ec0 100644 --- a/src/models/model.h +++ b/src/models/model.h @@ -119,8 +119,6 @@ struct Model : std::enable_shared_from_this { std::unique_ptr ExpandInputs(std::unique_ptr& input, int num_beams) const; - void GetMaxBatchSizeFromGeneratorParams(const GeneratorParams& params); - CapturedGraphPool* GetCapturedGraphPool() const { return captured_graph_pool_.get(); } std::unique_ptr config_; @@ -136,9 +134,6 @@ struct Model : std::enable_shared_from_this { std::shared_ptr external_owner_; // Set to 'this' when created by the C API to preserve lifetime - bool use_cuda_graph_{}; - int max_batch_size_{}; - #if USE_DML DmlExecutionContext* GetDmlExecutionContext() const { return dml_execution_context_.get(); } DmlReadbackHeap* GetDmlReadbackHeap() const { return dml_readback_heap_.get(); } diff --git a/src/ort_genai.h b/src/ort_genai.h index fb863dae2..aa6553a5c 100644 --- a/src/ort_genai.h +++ b/src/ort_genai.h @@ -75,7 +75,7 @@ struct OgaModel : OgaAbstract { return std::unique_ptr(p); } - std::unique_ptr Generate(const OgaGeneratorParams& params) { + std::unique_ptr Generate(const OgaGeneratorParams& params) const { OgaSequences* p; OgaCheckResult(OgaGenerate(this, ¶ms, &p)); return std::unique_ptr(p); @@ -201,7 +201,7 @@ struct OgaGeneratorParams : OgaAbstract { }; struct OgaGenerator : OgaAbstract { - static std::unique_ptr Create(OgaModel& model, const OgaGeneratorParams& params) { + static std::unique_ptr Create(const OgaModel& model, const OgaGeneratorParams& params) { OgaGenerator* p; 
OgaCheckResult(OgaCreateGenerator(&model, ¶ms, &p)); return std::unique_ptr(p); @@ -235,3 +235,10 @@ struct OgaGenerator : OgaAbstract { static void operator delete(void* p) { OgaDestroyGenerator(reinterpret_cast(p)); } }; + +struct OgaHandle { + OgaHandle() = default; + ~OgaHandle() noexcept { + OgaShutdown(); + } +}; diff --git a/src/ort_genai_c.cpp b/src/ort_genai_c.cpp index ce5eb56e9..c383d22f0 100644 --- a/src/ort_genai_c.cpp +++ b/src/ort_genai_c.cpp @@ -13,16 +13,6 @@ namespace Generators { -std::unique_ptr g_ort_env; - -OrtEnv& GetOrtEnv() { - if (!g_ort_env) { - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - } - return *g_ort_env; -} - struct Result { explicit Result(const char* what) : what_{what} {} std::string what_; @@ -39,6 +29,10 @@ extern "C" { return reinterpret_cast(std::make_unique(e.what()).release()); \ } +void OGA_API_CALL OgaShutdown() { + Generators::Shutdown(); +} + const char* OGA_API_CALL OgaResultGetError(const OgaResult* result) { return reinterpret_cast(result)->what_.c_str(); } @@ -111,7 +105,7 @@ OgaResult* OGA_API_CALL OgaGeneratorParamsSetSearchBool(OgaGeneratorParams* gene OgaResult* OGA_API_CALL OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(OgaGeneratorParams* generator_params, int32_t max_batch_size) { OGA_TRY auto* params = reinterpret_cast(generator_params); - params->max_batch_size = max_batch_size; + params->TryGraphCapture(max_batch_size); return nullptr; OGA_CATCH } @@ -146,23 +140,17 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera OGA_CATCH } -OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { +OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - auto result = Generators::Generate(*model_p, *params); + auto result = Generators::Generate(*reinterpret_cast(model), *reinterpret_cast(generator_params)); *out = reinterpret_cast(std::make_unique(std::move(result)).release()); return nullptr; OGA_CATCH } -OgaResult* OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { +OgaResult* OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaGenerator** out) { OGA_TRY - auto* model_p = reinterpret_cast(model); - auto* params = reinterpret_cast(generator_params); - model_p->GetMaxBatchSizeFromGeneratorParams(*params); - *out = reinterpret_cast(CreateGenerator(*model_p, *params).release()); + *out = reinterpret_cast(CreateGenerator(*reinterpret_cast(model), *reinterpret_cast(generator_params)).release()); return nullptr; OGA_CATCH } diff --git a/src/ort_genai_c.h b/src/ort_genai_c.h index df4c47dde..7c703061b 100644 --- a/src/ort_genai_c.h +++ b/src/ort_genai_c.h @@ -40,6 +40,10 @@ typedef struct OgaSequences OgaSequences; typedef struct OgaTokenizer OgaTokenizer; typedef struct OgaTokenizerStream OgaTokenizerStream; +/* \brief Call this on process exit to cleanly shutdown the genai library & its onnxruntime usage + */ +OGA_EXPORT void OGA_API_CALL OgaShutdown(); + /* * \param[in] result OgaResult that contains the error message. * \return Error message contained in the OgaResult. The const char* is owned by the OgaResult @@ -111,7 +115,7 @@ OGA_EXPORT void OGA_API_CALL OgaDestroyModel(OgaModel* model); * after it is done using the sequences. 
* \return OgaResult containing the error message if the generation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaGenerate(const OgaModel* model, const OgaGeneratorParams* generator_params, OgaSequences** out); /* * \brief Creates a OgaGeneratorParams from the given model. @@ -161,7 +165,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetWhisperDecoderInputIDs(O * \param[out] out The created generator. * \return OgaResult containing the error message if the generator creation failed. */ -OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); +OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateGenerator(const OgaModel* model, const OgaGeneratorParams* params, OgaGenerator** out); /* * \brief Destroys the given generator. diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index bc85d8bce..6dbf82371 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -64,8 +64,12 @@ if(BUILD_WHEEL) "libcufft.so.11" "libcurand.so.10" "libnvinfer.so.8" + "libnvinfer.so.10" "libnvinfer_plugin.so.8" + "libnvinfer_plugin.so.10" "libnvonnxparser.so.8" + "libnvonnxparser.so.10" + ) set(modified_exclude_list) foreach(item IN LISTS auditwheel_exclude_list) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1a9c37a3e..3865de4bc 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -167,16 +167,17 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "use_rotemb_in_attn": False, # Use rotary embeddings within attention op (instead of a separate RotaryEmbedding op) "use_packed_matmul": False, # Use packed MatMul (instead of 3 separate MatMuls for Q/K/V) } - if self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16: + enable_GQA_on_CPU = True if "enable_GQA_on_CPU" in extra_options and extra_options["enable_GQA_on_CPU"] == "1" else False + if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT): # Change model settings for GroupQueryAttention self.attention_attrs["op_type"] = "GroupQueryAttention" - print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.") + print("GroupQueryAttention (GQA) is used in this model.") # DML doesn't support packed Q/K/V for GQA yet self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads # GQA + Rot.Emb. 
does not require `position ids` as input - if self.ep == "cuda": + if self.ep in {"cuda", "cpu"}: self.attention_attrs["use_rotemb_in_attn"] = True self.input_names.remove("position_ids") @@ -227,7 +228,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir): "num_key_value_heads": self.num_kv_heads, }, "eos_token_id": config.eos_token_id, - "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id, + "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id, "type": self.model_type[ : self.model_type.find("For")].lower(), "vocab_size": self.vocab_size, }, @@ -1979,6 +1980,7 @@ def get_args(): enable_cuda_graph = 1 : The model can use CUDA graph capture for CUDA execution provider. If enabled, all nodes being placed on the CUDA EP is the prerequisite for the CUDA graph to be used correctly. It is not guaranteed that cuda graph be enabled as it depends on the model and the graph structure. + enable_GQA_on_CPU = Enalbe G(Group)Query(Q)Attention(A) on CPU. """), ) diff --git a/src/python/python.cpp b/src/python/python.cpp index dd9d5a9f3..1d8a4e567 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -24,15 +24,6 @@ pybind11::array_t ToPython(std::span v) { namespace Generators { -std::unique_ptr g_ort_env; - -OrtEnv& GetOrtEnv() { - if (!g_ort_env) { - g_ort_env = OrtEnv::Create(); - } - return *g_ort_env; -} - // A roaming array is one that can be in CPU or GPU memory, and will copy the memory as needed to be used from anywhere template struct PyRoamingArray : RoamingArray { @@ -113,7 +104,7 @@ struct PyGeneratorParams { } void TryUseCudaGraphWithMaxBatchSize(pybind11::int_ max_batch_size) { - params_->max_batch_size = max_batch_size.cast(); + params_->TryGraphCapture(max_batch_size.cast()); } pybind11::array_t py_input_ids_; @@ -124,7 +115,6 @@ struct PyGeneratorParams { struct PyGenerator { PyGenerator(Model& model, PyGeneratorParams& params) { params.Prepare(); - model.GetMaxBatchSizeFromGeneratorParams(params); generator_ = CreateGenerator(model, params); } @@ -186,6 +176,14 @@ PYBIND11_MODULE(onnxruntime_genai, m) { )pbdoc"; + // Add a cleanup call to happen before global variables are destroyed + static int unused{}; // The capsule needs something to reference + pybind11::capsule cleanup( + &unused, "cleanup", [](PyObject*) { + Generators::Shutdown(); + }); + m.add_object("_cleanup", cleanup); + // So that python users can catch OrtExceptions specifically pybind11::register_exception(m, "OrtException"); @@ -203,9 +201,6 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def("set_search_options", &PyGeneratorParams::SetSearchOptions) // See config.h 'struct Search' for the options .def("try_use_cuda_graph_with_max_batch_size", &PyGeneratorParams::TryUseCudaGraphWithMaxBatchSize); - // We need to init the OrtApi before we can use it - Ort::InitApi(); - pybind11::class_(m, "TokenizerStream") .def("decode", [](TokenizerStream& t, int32_t token) { return t.Decode(token); }); @@ -233,7 +228,7 @@ PYBIND11_MODULE(onnxruntime_genai, m) { .def(pybind11::init([](const std::string& config_path) { return CreateModel(GetOrtEnv(), config_path.c_str()); })) - .def("generate", [](Model& model, PyGeneratorParams& params) { params.Prepare(); model.GetMaxBatchSizeFromGeneratorParams(params); return Generate(model, params); }) + .def("generate", [](Model& model, 
PyGeneratorParams& params) { params.Prepare(); return Generate(model, params); }) .def_property_readonly("device_type", [](const Model& s) { return s.device_type_; }); pybind11::class_(m, "Generator") diff --git a/src/python/setup.py.in b/src/python/setup.py.in index 866515be1..1d784df2e 100644 --- a/src/python/setup.py.in +++ b/src/python/setup.py.in @@ -13,6 +13,7 @@ class BinaryDistribution(Distribution): setup( name='@TARGET_NAME@', version='@VERSION_INFO@', + description='Generative AI API for ONNX Runtime', packages=['onnxruntime_genai', 'onnxruntime_genai.models'], include_package_data=True, package_data={'': ['*.pyd', '*.dll', '*.so*']}, diff --git a/src/smartptrs.h b/src/smartptrs.h index 9eab82abc..5591cfde5 100644 --- a/src/smartptrs.h +++ b/src/smartptrs.h @@ -115,7 +115,7 @@ struct cuda_stream_holder { #else struct cuda_stream_holder { void Create() { - assert(false); + throw std::runtime_error("Trying to create a cuda stream in a non cuda build"); } operator cudaStream_t() const { return v_; } diff --git a/src/tokenizer/CMakeLists.txt b/src/tokenizer/CMakeLists.txt index 69d603715..135dedce6 100644 --- a/src/tokenizer/CMakeLists.txt +++ b/src/tokenizer/CMakeLists.txt @@ -8,11 +8,10 @@ file(GLOB tokenizer_srcs CONFIGURE_DEPENDS ) FetchContent_Declare(GSL URL https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip) -FetchContent_MakeAvailable(GSL) +onnxruntime_fetchcontent_makeavailable(GSL) FetchContent_Declare(simdjson URL https://github.com/simdjson/simdjson/archive/refs/tags/v3.6.3.zip URL_HASH SHA1=2b063a2e81f74a5d1cb937fadf3d2fca0f1edb09) -FetchContent_MakeAvailable(simdjson) - +onnxruntime_fetchcontent_makeavailable(simdjson) add_library(tokenizer STATIC ${tokenizer_srcs}) message(STATUS "GSL_SOURCE_DIR: ${GSL_SOURCE_DIR}") diff --git a/src/tokenizer/c_api/tfmtok_c.cc b/src/tokenizer/c_api/tfmtok_c.cc index 02c57ce65..3dc9be009 100644 --- a/src/tokenizer/c_api/tfmtok_c.cc +++ b/src/tokenizer/c_api/tfmtok_c.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. 
#include -#include +#include "../filesystem.h" #include #include "tfmtok.h" @@ -117,7 +117,7 @@ tfmError_t TFM_API_CALL TfmCreateTokenizer(TfmTokenizer** tokenizer, return kTfmErrorInvalidArgument; } - if (!std::filesystem::is_directory(tokenizer_path)) { + if (!fs::is_directory(tokenizer_path)) { last_error_message = std::string("Cannot find the directory of ") + tokenizer_path; return kTfmErrorInvalidArgument; } diff --git a/src/tokenizer/config.cc b/src/tokenizer/config.cc index dbc0908cf..a40b7d7db 100644 --- a/src/tokenizer/config.cc +++ b/src/tokenizer/config.cc @@ -4,7 +4,7 @@ #include #include #include -#include +#include "../filesystem.h" #include "config.h" @@ -68,8 +68,7 @@ TfmStatus TokenConfig::LoadJson(const std::string& json_path) { simdjson::dom::parser parser; simdjson::dom::element root; - if (!std::filesystem::exists( - std::filesystem::path(json_path).lexically_normal())) { + if (!fs::exists(fs::path(json_path))) { return {kTfmErrorInvalidFile, std::string(json_path) + " not found"}; } std::string json_text = PatchJsonText(json_path); diff --git a/src/tokenizer/token_bpe.cc b/src/tokenizer/token_bpe.cc index 93c897eea..80ac9d5bf 100644 --- a/src/tokenizer/token_bpe.cc +++ b/src/tokenizer/token_bpe.cc @@ -237,15 +237,17 @@ std::vector BPETokenizer::Encode(std::string_view sv_input, int64_ text = text.strip() */ std::u32string str = RemoveConsecutiveSpaces(input); - if (IsUnicodeSpace(str.front())) { - str.erase(str.begin()); - } - if (IsUnicodeSpace(str.back())) { - str.pop_back(); + if (!str.empty()) { + if (IsUnicodeSpace(str.front())) { + str.erase(str.begin()); + } + if (IsUnicodeSpace(str.back())) { + str.pop_back(); + } + // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned) + str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end()); + str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end()); } - // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned) - str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end()); - str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end()); input = str; } @@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta token.push_back(' '); } } // end case of whitespace_token_ + + bpe_state->incomplete_utf8_ += token; + token.clear(); + std::string& s_utf8 = bpe_state->incomplete_utf8_; + size_t utf8_len = 1; + size_t utf8_all_len = 0; + for (size_t i = 0; i < s_utf8.size(); i += utf8_len) { + utf8_len = UTF8Len(s_utf8[i]); + if (utf8_len <= s_utf8.size() - i) { + utf8_all_len += utf8_len; + auto _t = s_utf8.substr(i, utf8_len); + token += ValidateUTF8(_t) ? 
_t : ""; + } + } + s_utf8 = s_utf8.substr(utf8_all_len); } return status; diff --git a/src/tokenizer/token_bpe.h b/src/tokenizer/token_bpe.h index ed5f1f23c..2327b3a60 100644 --- a/src/tokenizer/token_bpe.h +++ b/src/tokenizer/token_bpe.h @@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl { BPEDeocerState() = default; ~BPEDeocerState() override = default; bool f_special_last; + std::string incomplete_utf8_; }; public: diff --git a/src/tokenizer/tokenizer.cc b/src/tokenizer/tokenizer.cc index b2a0622e7..4f52acd72 100644 --- a/src/tokenizer/tokenizer.cc +++ b/src/tokenizer/tokenizer.cc @@ -1,7 +1,7 @@ #include "token_bpe.h" #include "token_rwkv.h" -#include +#include "../filesystem.h" #include namespace tfm { @@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path, if (type.empty()) { if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) { type = "BPE"; - } else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) { + } /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) { // if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model type = "SPM"; - } else { + } */ else { status = TfmStatus(kTfmErrorInvalidArgument, "Cannot determine the tokenizer type from tokenizer_path argument"); } } @@ -43,7 +43,7 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path, } /* else if (type == "SPM") { token_ptr = std::make_unique(); } */ else { - status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported."); + status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported."); } if (status.ok()) { diff --git a/src/tokenizer/utils/unescape.cc b/src/tokenizer/utils/unescape.cc index f42e962f9..f94a1f192 100644 --- a/src/tokenizer/utils/unescape.cc +++ b/src/tokenizer/utils/unescape.cc @@ -41,27 +41,60 @@ std::string EncodeUTF8Char(char32_t utf8_char) { return {utf8_buf}; } -bool ValidateUTF8(const std::string& data) { - int cnt = 0; - for (size_t i = 0; i < data.size(); i++) { - int x = data[i]; - if (!cnt) { - if ((x >> 5) == 0b110) { - cnt = 1; - } else if ((x >> 4) == 0b1110) { - cnt = 2; - } else if ((x >> 3) == 0b11110) { - cnt = 3; - } else if ((x >> 7) != 0) { + bool ValidateUTF8(const std::string& data) { + const unsigned char* s = reinterpret_cast(data.c_str()); + const unsigned char* s_end = s + data.size(); + if (*s_end != '\0') + return false; + + while (*s) { + if (*s < 0x80) + /* 0xxxxxxx */ + s++; + else if ((s[0] & 0xe0) == 0xc0) { + /* 110XXXXx 10xxxxxx */ + if (s + 1 >= s_end) { + return false; + } + if ((s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) /* overlong? */ + return false; + else + s += 2; + } else if ((s[0] & 0xf0) == 0xe0) { + /* 1110XXXX 10Xxxxxx 10xxxxxx */ + if (s + 2 >= s_end) { + return false; + } + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */ + return false; + else + s += 3; + } else if ((s[0] & 0xf8) == 0xf0) { + /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ + if (s + 3 >= s_end) { + return false; + } + if ((s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? 
*/ + return false; + else + s += 4; + } else return false; - } - } else { - if ((x >> 6) != 0b10) return false; - cnt--; } + + return true; } - return cnt == 0; -} + bool IsDigit(char c) { return c >= '0' && c <= '9'; } bool IsHexDigit(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } diff --git a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj similarity index 92% rename from test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj rename to test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index e4ec8e6d8..978deb04e 100644 --- a/test/csharp/Microsoft.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -12,7 +12,8 @@ default True Debug;RelWithDebInfo;Release - + https://api.nuget.org/v3/index.json + $(RestoreAdditionalProjectSources);$(RestoreSources) Microsoft.ML.OnnxRuntimeGenAI.Tests Microsoft.ML.OnnxRuntimeGenAI.Tests diff --git a/test/main.cpp b/test/main.cpp index 2f69632b0..ee21918c4 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -5,20 +5,16 @@ #include #include -extern std::unique_ptr g_ort_env; - int main(int argc, char** argv) { std::cout << "Generators Utility Library" << std::endl; std::cout << "Initializing OnnxRuntime... "; std::cout.flush(); try { - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); std::cout << "done" << std::endl; ::testing::InitGoogleTest(&argc, argv); int result = RUN_ALL_TESTS(); std::cout << "Shutting down OnnxRuntime... "; - g_ort_env.reset(); + Generators::Shutdown(); std::cout << "done" << std::endl; return result; } catch (const std::exception& e) { diff --git a/test/model_tests.cpp b/test/model_tests.cpp index edeeb4ea4..66ceaee83 100644 --- a/test/model_tests.cpp +++ b/test/model_tests.cpp @@ -10,7 +10,6 @@ #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" #endif -std::unique_ptr g_ort_env; // To generate this file: // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 @@ -33,7 +32,7 @@ TEST(ModelTests, GreedySearchGptFp32) { // To generate this file: // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 --output tiny_gpt2_greedysearch_fp16.onnx --use_gpu --max_length 20 // And copy the resulting gpt2_init_past_fp32.onnx file into these two files (as it's the same for gpt2) - auto model = Generators::CreateModel(*g_ort_env, + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto params = Generators::CreateGeneratorParams(*model); @@ -74,7 +73,7 @@ TEST(ModelTests, BeamSearchGptFp32) { // --output tiny_gpt2_beamsearch_fp16.onnx --use_gpu --max_length 20 // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); auto params = Generators::CreateGeneratorParams(*model); params->batch_size = static_cast(input_ids_shape[0]); @@ -119,7 +118,7 @@ void Test_GreedySearch_Gpt_Cuda(const char* model_path, const char* model_label) 0, 0, 0, 52, 204, 204, 204, 204, 204, 204, 0, 0, 195, 731, 731, 114, 114, 114, 114, 114}; - auto model = Generators::CreateModel(*g_ort_env, model_path); + auto model = 
Generators::CreateModel(Generators::GetOrtEnv(), model_path); auto params = Generators::CreateGeneratorParams(*model); params->batch_size = static_cast(input_ids_shape[0]); @@ -164,7 +163,7 @@ void Test_BeamSearch_Gpt_Cuda(const char* model_path, const char* model_label) { // python convert_generation.py --model_type gpt2 -m hf-internal-testing/tiny-random-gpt2 // --output tiny_gpt2_beamsearch_fp16.onnx --use_gpu --max_length 20 // (with separate_gpt2_decoder_for_init_run set to False as it is now set to True by default) - auto model = Generators::CreateModel(*g_ort_env, model_path); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), model_path); auto params = Generators::CreateGeneratorParams(*model); params->batch_size = static_cast(input_ids_shape[0]); @@ -215,7 +214,7 @@ Print all primes between 1 and n std::cout << "With prompt:" << prompt << "\r\n"; - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "phi-2"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "phi-2"); auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); @@ -253,7 +252,7 @@ Print all primes between 1 and n std::cout << "With prompt:" << prompt << "\r\n"; - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "phi-2"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "phi-2"); auto tokenizer = model->CreateTokenizer(); auto tokens = tokenizer->Encode(prompt); diff --git a/test/sampling_benchmark.cpp b/test/sampling_benchmark.cpp index 6190e2507..e614b2b20 100644 --- a/test/sampling_benchmark.cpp +++ b/test/sampling_benchmark.cpp @@ -14,13 +14,11 @@ #define MODEL_PATH "../../test/test_models/" #endif -extern std::unique_ptr g_ort_env; - // Defined in sampling_tests.cpp void CreateRandomLogits(float* logits, int num_large, int vocab_size, int batch_size, std::mt19937& engine); TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; @@ -54,7 +52,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCpu) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; int k = 5; @@ -91,7 +89,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCpu) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; float p = 0.95f; @@ -132,7 +130,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCpu) { #include "tests_helper.cuh" TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); 
int vocab_size = 32000; // vocab size of llama int batch_size = 1; std::vector input_ids{0, 1, 2, 3, 4}; @@ -175,10 +173,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPCuda) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { - std::unique_ptr g_ort_env; - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; int k = 5; @@ -218,10 +213,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopKCuda) { } TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { - std::unique_ptr g_ort_env; - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 1; float p = 0.95f; @@ -266,10 +258,7 @@ TEST(Benchmarks, BenchmarkRandomizedSamplingTopPAndKCuda) { } TEST(Benchmarks, BenchmarkRandomizedSelectTopCuda) { - std::unique_ptr g_ort_env; - Ort::InitApi(); - g_ort_env = OrtEnv::Create(); - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 12; std::vector input_ids{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; // Needs to match batch_size diff --git a/test/sampling_tests.cpp b/test/sampling_tests.cpp index 239c71ab6..d2fe34cc6 100644 --- a/test/sampling_tests.cpp +++ b/test/sampling_tests.cpp @@ -12,10 +12,9 @@ #ifndef MODEL_PATH #define MODEL_PATH "../../test/test_models/" #endif -extern std::unique_ptr g_ort_env; TEST(SamplingTests, BatchedSamplingTopPCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector expected_output{1, 2, 3, 4}; auto output_span = Generators::cpu_span(expected_output); @@ -45,7 +44,7 @@ TEST(SamplingTests, BatchedSamplingTopPCpu) { } TEST(SamplingTests, BatchedSamplingTopKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -78,7 +77,7 @@ TEST(SamplingTests, BatchedSamplingTopKCpu) { } TEST(SamplingTests, BatchedSamplingTopPAndKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -128,7 +127,7 @@ void CreateRandomLogits(float* logits, int num_large, int vocab_size, int batch_ } TEST(SamplingTests, RandomizedSamplingTopPCpu) { - auto model = 
Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; @@ -165,7 +164,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) { } TEST(SamplingTests, RandomizedSamplingTopKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; int k = 5; @@ -203,7 +202,7 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) { } TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; float p = 0.95f; @@ -246,7 +245,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) { #include "tests_helper.cuh" TEST(SamplingTests, BatchedSamplingTopPCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector expected_output{1, 2, 3, 4}; auto output_span = Generators::cpu_span(expected_output); @@ -278,7 +277,7 @@ TEST(SamplingTests, BatchedSamplingTopPCuda) { } TEST(SamplingTests, BatchedSamplingTopKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -312,7 +311,7 @@ TEST(SamplingTests, BatchedSamplingTopKCuda) { } TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); std::vector input_ids{0, 1, 2, 3}; std::vector logits_cpu{2.0f, 1.5f, 1.25f, 0.25f, 0.25f, 0.25f, 2.0f, 1.25f, 1.5f, 0.25f, @@ -347,7 +346,7 @@ TEST(SamplingTests, BatchedSamplingTopPAndKCuda) { } TEST(SamplingTests, RandomizedSamplingTopPCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4}; @@ -388,7 +387,7 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) { } TEST(SamplingTests, RandomizedSamplingTopKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; int k = 5; @@ -430,7 +429,7 @@ 
TEST(SamplingTests, RandomizedSamplingTopKCuda) { } TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; float p = 0.95f; @@ -474,7 +473,7 @@ TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) { } TEST(SamplingTests, RandomizedSamplingSelectTopCuda) { - auto model = Generators::CreateModel(*g_ort_env, MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); + auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32"); int vocab_size = 32000; // vocab size of llama int batch_size = 5; std::vector input_ids{0, 1, 2, 3, 4};
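
The new OgaShutdown export, together with the OgaHandle helpers added in src/ort_genai.h and src/csharp/Utils.cs, lets an application tear down the library's shared OrtEnv exactly once at process exit. A minimal sketch of the intended C++ usage follows; the model path is a placeholder and the calls assume the existing ort_genai.h wrappers, so treat it as illustrative rather than shipped sample code.

    #include "ort_genai.h"

    int main() {
      // Constructed first, destroyed last: its destructor runs OgaShutdown()
      // after every other Oga* object below has been released, mirroring the
      // C# OgaHandle finalizer.
      OgaHandle handle;

      // "path/to/model" is a placeholder model directory for illustration.
      auto model = OgaModel::Create("path/to/model");
      auto params = OgaGeneratorParams::Create(*model);
      // ... set input sequences and search options on params ...

      // Generate() is const in this change, so a shared, read-only model works here.
      auto output_sequences = model->Generate(*params);
      return 0;
    }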
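
src/filesystem.h exists so translation units can use the `fs::` alias and keep building against GCC 8's std::experimental::filesystem as well as newer toolchains; logging.cpp, model.cpp, config.cc and tokenizer.cc switch over to it in this diff. A small consumer would look like the sketch below; the genai_config.json name is only an example, and the USE_EXPERIMENTAL_FILESYSTEM define (plus any -lstdc++fs link flag) is assumed to come from the build system, which this diff does not show.

    #include <string>
    #include "filesystem.h"  // maps fs:: to std::filesystem or std::experimental::filesystem

    bool ConfigExists(const std::string& config_dir) {
      // Callers stay agnostic to which filesystem implementation the toolchain provides.
      return fs::exists(fs::path(config_dir) / "genai_config.json");
    }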
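
Model::GetMaxBatchSizeFromGeneratorParams is gone; graph-capture validation now lives in GeneratorParams::TryGraphCapture and is reached from the C API through OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize (and from Python via try_use_cuda_graph_with_max_batch_size). A hedged sketch of the C-API flow, with error handling reduced to a small local helper:

    #include <stdexcept>
    #include <string>
    #include "ort_genai_c.h"

    static void Check(OgaResult* result) {
      if (result) {
        std::string message = OgaResultGetError(result);
        OgaDestroyResult(result);
        throw std::runtime_error(message);
      }
    }

    void ConfigureForGraphCapture(const OgaModel* model, int32_t max_batch_size) {
      OgaGeneratorParams* params = nullptr;
      Check(OgaCreateGeneratorParams(model, &params));

      // On CUDA/DML this errors out when capture is enabled but max_batch_size is 0;
      // on CPU, or when capture is not enabled in the session options, it is a no-op.
      Check(OgaGeneratorParamsTryGraphCaptureWithMaxBatchSize(params, max_batch_size));

      OgaGenerator* generator = nullptr;
      Check(OgaCreateGenerator(model, params, &generator));
      // ... run the generator ...
      OgaDestroyGenerator(generator);
      OgaDestroyGeneratorParams(params);
    }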
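
BPETokenizer::Id2Token now parks partially decoded bytes in BPEDeocerState::incomplete_utf8_ so that a multi-byte character split across two token ids is only surfaced once all of its bytes have arrived (and only if it passes ValidateUTF8). The standalone sketch below mimics that buffering with a local Utf8Len helper; the names and the simplified loop are illustrative, not the library code.

    #include <cassert>
    #include <string>

    static size_t Utf8Len(unsigned char first_byte) {
      if ((first_byte & 0x80) == 0x00) return 1;
      if ((first_byte & 0xe0) == 0xc0) return 2;
      if ((first_byte & 0xf0) == 0xe0) return 3;
      if ((first_byte & 0xf8) == 0xf0) return 4;
      return 1;  // invalid lead byte: pass it through rather than stalling the stream
    }

    // `pending` plays the role of bpe_state->incomplete_utf8_: bytes carried over
    // from previous tokens. Only complete characters are returned to the caller.
    std::string EmitCompleteUtf8(std::string& pending, const std::string& token_bytes) {
      pending += token_bytes;
      std::string out;
      size_t consumed = 0;
      for (size_t i = 0; i < pending.size();) {
        size_t len = Utf8Len(static_cast<unsigned char>(pending[i]));
        if (len > pending.size() - i) break;  // trailing bytes of an unfinished character
        out += pending.substr(i, len);
        i += len;
        consumed = i;
      }
      pending = pending.substr(consumed);
      return out;
    }

    int main() {
      std::string pending;
      // U+20AC (the euro sign) is e2 82 ac; feed it split across two "tokens".
      assert(EmitCompleteUtf8(pending, "\xe2\x82") == "");  // nothing complete yet
      assert(EmitCompleteUtf8(pending, "\xac!") == "\xe2\x82\xac!");
      return 0;
    }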
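
The rewritten ValidateUTF8 walks raw bytes and rejects overlong encodings, UTF-16 surrogates, U+FFFE/U+FFFF, and code points above U+10FFFF, rather than only counting continuation bytes. The spot checks below capture that intended accept/reject behaviour; they declare the function locally and are meant to be linked against unescape.cc, so adjust the include and namespace to match the real header before adding them to the test suite.

    #include <cassert>
    #include <string>

    // Mirrors the definition in src/tokenizer/utils/unescape.cc; adjust the
    // namespace/include to the real declaration when wiring this into the tests.
    bool ValidateUTF8(const std::string& data);

    int main() {
      assert(ValidateUTF8("plain ascii"));
      assert(ValidateUTF8("\xe2\x82\xac"));       // U+20AC, well-formed 3-byte sequence
      assert(!ValidateUTF8("\xc0\xaf"));          // overlong encoding of '/'
      assert(!ValidateUTF8("\xed\xa0\x80"));      // UTF-16 surrogate U+D800
      assert(!ValidateUTF8("\xf4\x90\x80\x80"));  // first code point above U+10FFFF
      assert(!ValidateUTF8("\xe2\x82"));          // truncated sequence
      return 0;
    }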