From 39a0da19cb6a042218a604609b3f680dfc53440a Mon Sep 17 00:00:00 2001 From: Chester Liu <4710575+skyline75489@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:02:40 +0800 Subject: [PATCH] Enable Wheel and NuGet packaging on macOS x64 & arm64 (#841) * Implement #693 on macOS for non-framework usage. * Enable macOS x64 & ARM64 wheel build & packaging * Enable macOS x64 & ARM64 NuGet build & packaging Python validation run: * https://aiinfra.visualstudio.com/ONNX%20Runtime/_build/results?buildId=548627&view=results NuGet validation run: * https://aiinfra.visualstudio.com/ONNX%20Runtime/_build/results?buildId=548631&view=results --- .github/workflows/linux-cpu-x64-build.yml | 15 +-- .github/workflows/linux-gpu-x64-build.yml | 2 +- .github/workflows/mac-cpu-arm64-build.yml | 44 +++++++-- .pipelines/nuget-publishing.yml | 8 ++ .pipelines/pypl-publishing.yml | 7 ++ .pipelines/stages/capi-packaging-stage.yml | 21 +++- .pipelines/stages/jobs/capi-packaging-job.yml | 13 +++ .../stages/jobs/nuget-packaging-job.yml | 17 ++++ .../stages/jobs/nuget-validation-job.yml | 7 +- .pipelines/stages/jobs/py-packaging-job.yml | 16 ++- .pipelines/stages/jobs/py-validation-job.yml | 36 +++---- .../stages/jobs/steps/capi-linux-step.yml | 2 +- .../stages/jobs/steps/capi-macos-step.yml | 99 +++++++++++++++++++ .../jobs/steps/compliant-and-cleanup-step.yml | 1 + .../utils/download-huggingface-model.yml | 2 +- .pipelines/stages/nuget-packaging-stage.yml | 8 +- .pipelines/stages/nuget-validation-stage.yml | 12 +++ .pipelines/stages/py-packaging-stage.yml | 19 +++- .pipelines/stages/py-validation-stage.yml | 12 +++ CMakeLists.txt | 6 +- cmake/package.cmake | 6 +- cmake/presets/CMakeMacOSBuildPresets.json | 8 +- cmake/presets/CMakeMacOSConfigPresets.json | 25 ++++- examples/csharp/HelloPhi/Program.cs | 1 + src/models/onnxruntime_api.h | 59 +++++++---- src/python/CMakeLists.txt | 2 +- src/python/py/_dll_directory.py | 16 ++- test/CMakeLists.txt | 2 +- ...Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj | 10 ++ test/python/requirements-macos.txt | 4 + .../nuget/generate_nuspec_for_native_nuget.py | 4 +- 31 files changed, 410 insertions(+), 74 deletions(-) create mode 100644 .pipelines/stages/jobs/steps/capi-macos-step.yml create mode 100644 test/python/requirements-macos.txt diff --git a/.github/workflows/linux-cpu-x64-build.yml b/.github/workflows/linux-cpu-x64-build.yml index 6f970fdc0..5594cc5e1 100644 --- a/.github/workflows/linux-cpu-x64-build.yml +++ b/.github/workflows/linux-cpu-x64-build.yml @@ -82,25 +82,28 @@ jobs: run: | rm -rf ort + - name: Verify Build Artifacts + if: always() + continue-on-error: true + run: | + ls -l ${{ github.workspace }}/build/cpu + # This will also download all the test models to the test/test_models directory # These models are used by the python tests as well as C#, C++ and others. 
- name: Run the python tests run: | + export ORTGENAI_LOG_ORT_LIB=1 python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models - name: Build the C# API and Run the C# Tests run: | + export ORTGENAI_LOG_ORT_LIB=1 cd test/csharp dotnet test /p:Configuration=Release /p:NativeBuildOutputDir="../../build/cpu/" - - name: Verify Build Artifacts - if: always() - continue-on-error: true - run: | - ls -l ${{ github.workspace }}/build/cpu - - name: Run tests run: | set -e -x + export ORTGENAI_LOG_ORT_LIB=1 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GITHUB_WORKSPACE/build/cpu/ ./build/cpu/test/unit_tests diff --git a/.github/workflows/linux-gpu-x64-build.yml b/.github/workflows/linux-gpu-x64-build.yml index 857d54fd1..a675b1037 100644 --- a/.github/workflows/linux-gpu-x64-build.yml +++ b/.github/workflows/linux-gpu-x64-build.yml @@ -154,4 +154,4 @@ jobs: --rm \ --volume /data/ortgenai_pytorch_models:/data/ortgenai_pytorch_models \ --volume $GITHUB_WORKSPACE:/ort_genai_src \ - -w /ort_genai_src onnxruntimecudabuildx64 bash -c "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ort_genai_src/build/cuda/ /ort_genai_src/build/cuda/test/unit_tests" + -w /ort_genai_src onnxruntimecudabuildx64 bash -c "ORTGENAI_LOG_ORT_LIB=1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ort_genai_src/build/cuda/ /ort_genai_src/build/cuda/test/unit_tests" diff --git a/.github/workflows/mac-cpu-arm64-build.yml b/.github/workflows/mac-cpu-arm64-build.yml index 9f861a96d..a5f969284 100644 --- a/.github/workflows/mac-cpu-arm64-build.yml +++ b/.github/workflows/mac-cpu-arm64-build.yml @@ -14,7 +14,7 @@ env: ORT_PACKAGE_NAME: "Microsoft.ML.OnnxRuntime" jobs: mac-cpu-arm64-build: - runs-on: macos-latest + runs-on: macos-latest # arm64 steps: - name: Checkout OnnxRuntime GenAI repo uses: actions/checkout@v4 @@ -36,26 +36,50 @@ jobs: mv ${{ env.ORT_PACKAGE_NAME }}/build/native/include ort/ mv ${{ env.ORT_PACKAGE_NAME }}/runtimes/osx-arm64/native/* ort/lib/ - - name: Configure CMake run: | - cmake --preset macos_cpu_release + cmake --preset macos_arm64_cpu_release - name: Build with CMake run: | - cmake --build --preset macos_cpu_release --parallel + cmake --build --preset macos_arm64_cpu_release --parallel continue-on-error: false + - name: Install the python wheel and test dependencies + run: | + python3 -m venv genai-macos-venv + source genai-macos-venv/bin/activate + python3 -m pip install -r test/python/requirements.txt + python3 -m pip install -r test/python/requirements-macos.txt + python3 -m pip install build/cpu/osx-arm64/wheel/onnxruntime_genai*.whl --no-deps + + - name: Remove the ort lib and header files + run: | + rm -rf ort + - name: Verify Build Artifacts if: always() continue-on-error: true run: | - ls -l ${{ github.workspace }}/build + ls -l ${{ github.workspace }}/build/cpu/osx-arm64 - - name: Upload Build Artifacts - uses: actions/upload-artifact@v3 - with: - name: onnxruntime-genai-mac-cpu-arm64 - path: ${{ github.workspace }}/build/**/*.a + # This will also download all the test models to the test/test_models directory + # These models are used by the python tests as well as C#, C++ and others. 
+ - name: Run the python tests + run: | + source genai-macos-venv/bin/activate + export ORTGENAI_LOG_ORT_LIB=1 + python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models + - name: Build the C# API and Run the C# Tests + run: | + export ORTGENAI_LOG_ORT_LIB=1 + cd test/csharp + dotnet test /p:Configuration=Release /p:NativeBuildOutputDir="../../build/cpu/osx-arm64" + - name: Run tests + run: | + set -e -x + export ORTGENAI_LOG_ORT_LIB=1 + export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$GITHUB_WORKSPACE/build/cpu/osx-arm64 + ./build/cpu/osx-arm64/test/unit_tests diff --git a/.pipelines/nuget-publishing.yml b/.pipelines/nuget-publishing.yml index 137e2c36f..726493826 100644 --- a/.pipelines/nuget-publishing.yml +++ b/.pipelines/nuget-publishing.yml @@ -33,6 +33,11 @@ parameters: type: boolean default: true +- name: enable_macos_cpu + displayName: 'Whether MacOS CPU package is built.' + type: boolean + default: true + - name: ort_version displayName: 'OnnxRuntime version' type: string @@ -83,6 +88,7 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} enable_win_arm64: ${{ parameters.enable_win_arm64 }} + enable_macos_cpu: ${{ parameters.enable_macos_cpu }} ort_version: ${{ parameters.ort_version }} ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} @@ -97,6 +103,7 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} enable_win_arm64: ${{ parameters.enable_win_arm64 }} + enable_macos_cpu: ${{ parameters.enable_macos_cpu }} ort_version: ${{ parameters.ort_version }} ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} @@ -111,6 +118,7 @@ stages: enable_linux_cuda: ${{ parameters.enable_linux_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} enable_win_arm64: ${{ parameters.enable_win_arm64 }} + enable_macos_cpu: ${{ parameters.enable_macos_cpu }} ort_version: ${{ parameters.ort_version }} ort_cuda_version: ${{ parameters.ort_cuda_version }} ort_dml_version: ${{ parameters.ort_dml_version }} diff --git a/.pipelines/pypl-publishing.yml b/.pipelines/pypl-publishing.yml index 6ecbc938e..099679cc3 100644 --- a/.pipelines/pypl-publishing.yml +++ b/.pipelines/pypl-publishing.yml @@ -39,6 +39,11 @@ parameters: type: boolean default: true +- name: enable_macos_cpu + displayName: 'Whether MacOS CPU package is built.' 
+ type: boolean + default: true + - name: ort_version displayName: 'OnnxRuntime version' type: string @@ -97,6 +102,7 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} enable_win_arm64_cpu: ${{ parameters.enable_win_arm64_cpu }} + enable_macos_cpu: ${{ parameters.enable_macos_cpu }} ort_version: ${{ parameters.ort_version }} ort_cuda_118_version: ${{ parameters.ort_cuda_118_version }} ort_cuda_122_version: ${{ parameters.ort_cuda_122_version }} @@ -113,6 +119,7 @@ stages: enable_win_cuda: ${{ parameters.enable_win_cuda }} enable_win_dml: ${{ parameters.enable_win_dml }} enable_win_arm64_cpu: ${{ parameters.enable_win_arm64_cpu }} + enable_macos_cpu: ${{ parameters.enable_macos_cpu }} ort_version: ${{ parameters.ort_version }} ort_cuda_118_version: ${{ parameters.ort_cuda_118_version }} ort_cuda_122_version: ${{ parameters.ort_cuda_122_version }} diff --git a/.pipelines/stages/capi-packaging-stage.yml b/.pipelines/stages/capi-packaging-stage.yml index 1b9021a72..2da3971a8 100644 --- a/.pipelines/stages/capi-packaging-stage.yml +++ b/.pipelines/stages/capi-packaging-stage.yml @@ -11,6 +11,8 @@ parameters: type: boolean - name: enable_linux_cuda type: boolean +- name: enable_macos_cpu + type: boolean - name: ort_version type: string - name: ort_cuda_version @@ -91,4 +93,21 @@ stages: ep: 'cuda' ort_version: ${{ parameters.ort_cuda_version }} os: 'linux' - build_config: ${{ parameters.build_config }} \ No newline at end of file + build_config: ${{ parameters.build_config }} + + - ${{ if eq(parameters.enable_macos_cpu, true) }}: + - template: jobs/capi-packaging-job.yml + parameters: + arch: 'x64' + ep: 'cpu' + ort_version: ${{ parameters.ort_version }} + os: 'osx' + build_config: ${{ parameters.build_config }} + + - template: jobs/capi-packaging-job.yml + parameters: + arch: 'arm64' + ep: 'cpu' + ort_version: ${{ parameters.ort_version }} + os: 'osx' + build_config: ${{ parameters.build_config }} diff --git a/.pipelines/stages/jobs/capi-packaging-job.yml b/.pipelines/stages/jobs/capi-packaging-job.yml index 1d936eccb..aa9e9efa4 100644 --- a/.pipelines/stages/jobs/capi-packaging-job.yml +++ b/.pipelines/stages/jobs/capi-packaging-job.yml @@ -13,6 +13,7 @@ parameters: values: - 'linux' - 'win' + - 'osx' - name: build_config type: string default: 'release' @@ -33,6 +34,10 @@ jobs: pool: 'onnxruntime-genai-windows-vs-2022-arm64' ${{ else }}: pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.os, 'osx') }}: + pool: + vmImage: 'macOS-latest' + timeoutInMinutes: 180 # set variables here to be used in the template and steps variables: @@ -130,4 +135,12 @@ jobs: ep: ${{ parameters.ep }} build_config: ${{ parameters.build_config }} + - ${{ if eq(parameters.os, 'osx') }}: + - template: steps/capi-macos-step.yml + parameters: + target: 'onnxruntime-genai' + arch: ${{ parameters.arch }} + ep: ${{ parameters.ep }} + build_config: ${{ parameters.build_config }} + - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/nuget-packaging-job.yml b/.pipelines/stages/jobs/nuget-packaging-job.yml index e01374b9e..8599ee207 100644 --- a/.pipelines/stages/jobs/nuget-packaging-job.yml +++ b/.pipelines/stages/jobs/nuget-packaging-job.yml @@ -32,6 +32,11 @@ parameters: type: boolean default: false +- name: enable_macos_cpu + displayName: 'Whether MacOS CPU package is built.' 
+ type: boolean + default: false + - name: ort_version type: string @@ -130,6 +135,18 @@ jobs: ArtifactName: onnxruntime-genai-linux-cuda-x64-capi TargetPath: '$(Build.BinariesDirectory)/artifact-downloads' + - ${{ if eq(parameters.enable_macos_cpu, true) }}: + - template: steps/utils/flex-download-pipeline-artifact.yml + parameters: + StepName: 'Download osx-cpu-x64 capi Artifacts' + ArtifactName: onnxruntime-genai-osx-cpu-x64-capi + TargetPath: '$(Build.BinariesDirectory)/artifact-downloads' + - template: steps/utils/flex-download-pipeline-artifact.yml + parameters: + StepName: 'Download osx-cpu-arm64 capi Artifacts' + ArtifactName: onnxruntime-genai-osx-cpu-arm64-capi + TargetPath: '$(Build.BinariesDirectory)/artifact-downloads' + - checkout: self path: onnxruntime-genai clean: true diff --git a/.pipelines/stages/jobs/nuget-validation-job.yml b/.pipelines/stages/jobs/nuget-validation-job.yml index e382a3ca7..c5498e12e 100644 --- a/.pipelines/stages/jobs/nuget-validation-job.yml +++ b/.pipelines/stages/jobs/nuget-validation-job.yml @@ -21,6 +21,7 @@ parameters: values: - 'linux' - 'win' + - 'osx' jobs: - job: nuget_${{ parameters.os }}_${{ parameters.ep }}_${{ parameters.arch }}_validation @@ -43,6 +44,9 @@ jobs: pool: 'onnxruntime-genai-windows-vs-2022-arm64' ${{ else }}: pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.os, 'osx') }}: + pool: + vmImage: 'macOS-latest' timeoutInMinutes: 180 # set variables here to be used in the template and steps @@ -158,7 +162,7 @@ jobs: env: NUGET_PLUGIN_HANDSHAKE_TIMEOUT_IN_SECONDS: 180 NUGET_PLUGIN_REQUEST_TIMEOUT_IN_SECONDS: 180 - - ${{ elseif eq(parameters.os, 'linux') }}: + - ${{ elseif or(eq(parameters.os, 'linux'), eq(parameters.os, 'osx')) }}: - bash: | dotnet --info cp $(Build.BinariesDirectory)/nuget/* examples/csharp/HelloPhi/ @@ -166,6 +170,7 @@ jobs: mv models/$(prebuild_phi3_mini_model_folder) models/phi-3 dotnet restore -r $(os)-$(arch) /property:Configuration=$(csproj_configuration) --source https://api.nuget.org/v3/index.json --source https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/nuget/v3/index.json --source $PWD --disable-parallel --verbosity detailed dotnet build ./HelloPhi.csproj -r $(os)-$(arch) /property:Configuration=$(csproj_configuration) --no-restore --self-contained + ls -l ./bin/$(csproj_configuration)/net6.0/$(os)-$(arch)/ displayName: 'Perform dotnet restore & build' workingDirectory: '$(Build.Repository.LocalPath)' env: diff --git a/.pipelines/stages/jobs/py-packaging-job.yml b/.pipelines/stages/jobs/py-packaging-job.yml index 99582c9f1..db39b9afb 100644 --- a/.pipelines/stages/jobs/py-packaging-job.yml +++ b/.pipelines/stages/jobs/py-packaging-job.yml @@ -16,6 +16,7 @@ parameters: values: - 'linux' - 'win' + - 'osx' - name: build_config type: string default: 'release' @@ -37,9 +38,12 @@ jobs: pool: 'onnxruntime-genai-windows-vs-2022-arm64' ${{ else }}: pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.os, 'osx') }}: + pool: + vmImage: 'macOS-latest' strategy: - ${{ if eq(parameters.arch, 'arm64') }}: + ${{ if and(eq(parameters.os, 'win'), eq(parameters.arch, 'arm64')) }}: # Older Python versions don't have official win-arm64 build. 
matrix: Python311: PyDotVer: '3.11' @@ -73,6 +77,8 @@ jobs: - name: skipComponentGovernanceDetection ${{ if eq(parameters.os, 'linux') }}: value: true + ${{ if eq(parameters.os, 'osx') }}: + value: true ${{ if eq(parameters.os, 'win') }}: value: false - name: arch @@ -179,5 +185,13 @@ jobs: ep: ${{ parameters.ep }} build_config: ${{ parameters.build_config }} + - ${{ if eq(parameters.os, 'osx') }}: + - template: steps/capi-macos-step.yml + parameters: + target: 'python' + arch: ${{ parameters.arch }} + ep: ${{ parameters.ep }} + build_config: ${{ parameters.build_config }} + - template: steps/compliant-and-cleanup-step.yml diff --git a/.pipelines/stages/jobs/py-validation-job.yml b/.pipelines/stages/jobs/py-validation-job.yml index 6e3bd6625..2282f8f77 100644 --- a/.pipelines/stages/jobs/py-validation-job.yml +++ b/.pipelines/stages/jobs/py-validation-job.yml @@ -24,6 +24,7 @@ parameters: values: - 'linux' - 'win' + - 'osx' jobs: - job: python_${{ parameters.os }}_${{ parameters.ep }}${{ parameters.cuda_display_version }}_${{ parameters.arch }}_validation @@ -46,6 +47,9 @@ jobs: pool: 'onnxruntime-genai-windows-vs-2022-arm64' ${{ else }}: pool: 'onnxruntime-Win-CPU-2022' + ${{ if eq(parameters.os, 'osx') }}: + pool: + vmImage: 'macOS-latest' timeoutInMinutes: 240 workspace: @@ -147,22 +151,13 @@ jobs: addToPath: true architecture: $(arch) - - ${{ if eq(parameters.os, 'linux') }}: - - template: steps/utils/flex-download-pipeline-artifact.yml - parameters: - StepName: 'Download Python Wheel Artifacts' - ArtifactName: $(ArtifactName) - TargetPath: '$(Build.BinariesDirectory)/wheel' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} - - ${{ if eq(parameters.os, 'win') }}: - - template: steps/utils/flex-download-pipeline-artifact.yml - parameters: - StepName: 'Download Python Wheel Artifacts' - ArtifactName: $(ArtifactName)-wheel - TargetPath: '$(Build.BinariesDirectory)/wheel' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} + - template: steps/utils/flex-download-pipeline-artifact.yml + parameters: + StepName: 'Download Python Wheel Artifacts' + ArtifactName: $(ArtifactName)-wheel + TargetPath: '$(Build.BinariesDirectory)/wheel' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - template: steps/utils/download-huggingface-model.yml parameters: @@ -174,7 +169,7 @@ jobs: HuggingFaceToken: $(HF_TOKEN) os: ${{ parameters.os }} - - ${{ if eq(parameters.os, 'linux') }}: + - ${{ if or(eq(parameters.os, 'linux'), eq(parameters.os, 'osx')) }}: - ${{ if eq(parameters.ep, 'cuda') }}: - bash: | set -e -x @@ -204,7 +199,12 @@ jobs: - bash: | export ORTGENAI_LOG_ORT_LIB=1 python -m pip install -r test/python/requirements.txt - python -m pip install -r test/python/requirements-cpu.txt + if [[ "$(os)" == "linux" ]]; then + python -m pip install -r test/python/requirements-cpu.txt + fi + if [[ "$(os)" == "osx" ]]; then + python -m pip install -r test/python/requirements-macos.txt + fi cd examples/python python -m pip install --no-index --find-links=$(Build.BinariesDirectory)/wheel $(pip_package_name) python model-generate.py -m ./models/$(prebuild_phi3_mini_model_folder) --min_length 25 --max_length 50 --verbose diff --git a/.pipelines/stages/jobs/steps/capi-linux-step.yml b/.pipelines/stages/jobs/steps/capi-linux-step.yml index c3074f607..de7e5ae6c 100644 --- a/.pipelines/stages/jobs/steps/capi-linux-step.yml +++ b/.pipelines/stages/jobs/steps/capi-linux-step.yml @@ -169,7 +169,7 @@ 
steps: - task: PublishBuildArtifacts@1 displayName: 'Publish Artifact: ONNXRuntime python wheel' inputs: - ArtifactName: $(artifactName) + ArtifactName: $(artifactName)-wheel PathtoPublish: '$(Build.ArtifactStagingDirectory)/wheel' - script: | diff --git a/.pipelines/stages/jobs/steps/capi-macos-step.yml b/.pipelines/stages/jobs/steps/capi-macos-step.yml new file mode 100644 index 000000000..73818fdaf --- /dev/null +++ b/.pipelines/stages/jobs/steps/capi-macos-step.yml @@ -0,0 +1,99 @@ +parameters: +- name: target + type: string +- name: ep + type: string + default: 'cpu' +- name: arch + type: string + default: 'x64' +- name: build_config + type: string + default: 'release' + +steps: + +- checkout: self + clean: true + path: onnxruntime-genai + submodules: recursive + +- template: utils/set-nightly-build-option-variable.yml + +- bash: | + echo "arch=$(arch)" + echo "ort_filename=$(ort_filename)" + echo "ort_version=$(ort_version)" + echo "ep=$(ep)" + echo "cuda_version=$(cuda_version)" + echo "target=${{ parameters.target }}" + echo "build_config=${{ parameters.build_config }}" + displayName: 'Print Parameters' + +- template: utils/download-ort.yml + parameters: + archiveType: 'zip' + +- powershell: | + $env:MACOSX_DEPLOYMENT_TARGET = "12.0" # Monterey + cmake --preset macos_$(arch)_$(ep)_$(build_config) + displayName: 'Configure CMake C API' + workingDirectory: '$(Build.Repository.LocalPath)' + +- powershell: | + cmake --build --preset macos_$(arch)_$(ep)_$(build_config) --parallel --target ${{ parameters.target }} + displayName: 'Build C API' + workingDirectory: '$(Build.Repository.LocalPath)' + +- ${{ if eq(parameters.target, 'onnxruntime-genai') }}: + - powershell: | + python -m pip install wheel + displayName: 'Install wheel' + + - powershell: | + cmake --build --preset macos_$(arch)_$(ep)_$(build_config) --target package + displayName: 'Package C/C++ API' + workingDirectory: '$(Build.Repository.LocalPath)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime Genai capi' + inputs: + ArtifactName: $(artifactName)-capi + PathtoPublish: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/package' + +- ${{ if eq(parameters.target, 'python') }}: + - powershell: | + python -m pip install wheel + displayName: 'Install wheel' + + - powershell: | + # From: https://github.com/pypa/cibuildwheel/blob/93542c397cfe940bcbb8f1eff5c37d345ea16653/cibuildwheel/macos.py#L247-L260 + if ("$(arch)" -eq "arm64") { + $env:_PYTHON_HOST_PLATFORM = "macosx-12.0-arm64" + $env:ARCHFLAGS = "-arch arm64" + } + else { + $env:_PYTHON_HOST_PLATFORM = "macosx-12.0-x86_64" + $env:ARCHFLAGS = "-arch x86_64" + } + cmake --build --preset macos_$(arch)_$(ep)_$(build_config) --parallel --PyPackageBuild + displayName: 'Build Python Wheel' + workingDirectory: '$(Build.Repository.LocalPath)' + + - powershell: | + Get-ChildItem -Path $(Build.Repository.LocalPath) -Recurse + displayName: 'List all files in the repo for debugging' + workingDirectory: '$(Build.Repository.LocalPath)' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.Repository.LocalPath)/build/$(ep)/$(os)-$(arch)/wheel' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)/wheel' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: $(artifactName)-wheel + PathtoPublish: '$(Build.ArtifactStagingDirectory)/wheel' diff --git 
a/.pipelines/stages/jobs/steps/compliant-and-cleanup-step.yml b/.pipelines/stages/jobs/steps/compliant-and-cleanup-step.yml index aa2f7b547..19ee48dcc 100644 --- a/.pipelines/stages/jobs/steps/compliant-and-cleanup-step.yml +++ b/.pipelines/stages/jobs/steps/compliant-and-cleanup-step.yml @@ -4,6 +4,7 @@ steps: inputs: debugMode: false continueOnError: true + condition: ne(variables['os'], 'osx') # Not available on macOS. See https://eng.ms/docs/products/credential-risk-exposure-defense/solutions/credscan_client/overview - task: TSAUpload@2 displayName: 'TSA upload' diff --git a/.pipelines/stages/jobs/steps/utils/download-huggingface-model.yml b/.pipelines/stages/jobs/steps/utils/download-huggingface-model.yml index 30014e6ff..0dc5a8fc1 100644 --- a/.pipelines/stages/jobs/steps/utils/download-huggingface-model.yml +++ b/.pipelines/stages/jobs/steps/utils/download-huggingface-model.yml @@ -15,7 +15,7 @@ parameters: type: string steps: - - ${{ if eq(parameters.os, 'linux') }}: + - ${{ if or(eq(parameters.os, 'linux'), eq(parameters.os, 'osx')) }}: - bash: | python -m pip install "huggingface_hub[cli]" huggingface-cli login --token $HF_TOKEN diff --git a/.pipelines/stages/nuget-packaging-stage.yml b/.pipelines/stages/nuget-packaging-stage.yml index 4137b29bd..00ad62dd2 100644 --- a/.pipelines/stages/nuget-packaging-stage.yml +++ b/.pipelines/stages/nuget-packaging-stage.yml @@ -29,6 +29,11 @@ parameters: type: boolean default: true +- name: enable_macos_cpu + displayName: 'Whether MacOS CPU package is built.' + type: boolean + default: true + - name: ort_version type: string - name: ort_cuda_version @@ -42,7 +47,7 @@ parameters: stages: - stage: nuget_packaging jobs: - - ${{ if or(eq(parameters.enable_linux_cpu, true), eq(parameters.enable_win_cpu, true)) }}: + - ${{ if or(eq(parameters.enable_linux_cpu, true), eq(parameters.enable_win_cpu, true), eq(parameters.enable_macos_cpu, true)) }}: - template: jobs/nuget-packaging-job.yml parameters: ep: 'cpu' @@ -51,6 +56,7 @@ stages: enable_linux_cpu: ${{ parameters.enable_linux_cpu }} enable_win_cpu: ${{ parameters.enable_win_cpu }} enable_win_arm64: ${{ parameters.enable_win_arm64 }} + enable_macos_cpu: ${{ parameters.enable_macos_cpu }} - ${{ if or(eq(parameters.enable_linux_cuda, true), eq(parameters.enable_win_cuda, true)) }}: - template: jobs/nuget-packaging-job.yml parameters: diff --git a/.pipelines/stages/nuget-validation-stage.yml b/.pipelines/stages/nuget-validation-stage.yml index 380fda54b..d8c8b3ca4 100644 --- a/.pipelines/stages/nuget-validation-stage.yml +++ b/.pipelines/stages/nuget-validation-stage.yml @@ -19,6 +19,8 @@ parameters: type: boolean - name: enable_linux_cuda type: boolean +- name: enable_macos_cpu + type: boolean - name: ort_version type: string - name: ort_cuda_version @@ -104,3 +106,13 @@ stages: os: 'linux' SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} + + - ${{ if eq(parameters.enable_macos_cpu, true) }}: + - template: jobs/nuget-validation-job.yml + parameters: + arch: 'x64' + ep: 'cpu' + ort_version: ${{ parameters.ort_version }} + os: 'osx' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} \ No newline at end of file diff --git a/.pipelines/stages/py-packaging-stage.yml b/.pipelines/stages/py-packaging-stage.yml index d43868ed0..d7bab62c8 100644 --- a/.pipelines/stages/py-packaging-stage.yml +++ b/.pipelines/stages/py-packaging-stage.yml @@ -13,6 +13,8 @@ parameters: type: boolean - name: enable_linux_rocm type: boolean +- name: 
enable_macos_cpu + type: boolean - name: ort_version type: string - name: ort_cuda_118_version @@ -121,5 +123,18 @@ stages: ort_version: ${{ parameters.ort_rocm_version }} os: 'linux' build_config: ${{ parameters.build_config }} - - + - ${{ if eq(parameters.enable_macos_cpu, true) }}: + - template: jobs/py-packaging-job.yml + parameters: + arch: 'x64' + ep: 'cpu' + ort_version: ${{ parameters.ort_version }} + os: 'osx' + build_config: ${{ parameters.build_config }} + - template: jobs/py-packaging-job.yml + parameters: + arch: 'arm64' + ep: 'cpu' + ort_version: ${{ parameters.ort_version }} + os: 'osx' + build_config: ${{ parameters.build_config }} diff --git a/.pipelines/stages/py-validation-stage.yml b/.pipelines/stages/py-validation-stage.yml index d619c3acb..0b6abda5a 100644 --- a/.pipelines/stages/py-validation-stage.yml +++ b/.pipelines/stages/py-validation-stage.yml @@ -19,6 +19,8 @@ parameters: type: boolean - name: enable_linux_cuda type: boolean +- name: enable_macos_cpu + type: boolean - name: ort_version type: string - name: ort_cuda_118_version @@ -123,3 +125,13 @@ stages: os: 'linux' SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} + + - ${{ if eq(parameters.enable_macos_cpu, true) }}: + - template: jobs/py-validation-job.yml + parameters: + arch: 'x64' + ep: 'cpu' + ort_version: ${{ parameters.ort_version }} + os: 'osx' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 06d196d7b..a35772f4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,8 +84,10 @@ target_link_directories(onnxruntime-genai PRIVATE ${ORT_LIB_DIR}) # we keep the shared libraries disconnected on Android as they will come from separate AARs and we don't want to force # the ORT version to match in both. 
-if(NOT (CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")) -target_link_libraries(onnxruntime-genai PRIVATE ${ONNXRUNTIME_LIB}) +if(CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux" OR (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND (NOT BUILD_APPLE_FRAMEWORK))) + add_compile_definitions(_ORT_GENAI_USE_DLOPEN) +else() + target_link_libraries(onnxruntime-genai PRIVATE ${ONNXRUNTIME_LIB}) endif() set_target_properties(onnxruntime-genai PROPERTIES FOLDER "Sources") diff --git a/cmake/package.cmake b/cmake/package.cmake index 87ad79345..5939b1078 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -47,7 +47,11 @@ elseif (LINUX) set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-linux-arm64") endif () elseif (APPLE) - set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-osx-arm64") + if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-osx-x64") + else () + set(CPACK_PACKAGE_FILE_NAME "onnxruntime-genai-${VERSION_INFO}-osx-arm64") + endif() endif () if (WIN32) diff --git a/cmake/presets/CMakeMacOSBuildPresets.json b/cmake/presets/CMakeMacOSBuildPresets.json index e27635e9c..d3634c3d2 100644 --- a/cmake/presets/CMakeMacOSBuildPresets.json +++ b/cmake/presets/CMakeMacOSBuildPresets.json @@ -5,8 +5,12 @@ ], "buildPresets": [ { - "name": "macos_cpu_release", - "configurePreset": "macos_cpu_release" + "name": "macos_x64_cpu_release", + "configurePreset": "macos_x64_cpu_release" + }, + { + "name": "macos_arm64_cpu_release", + "configurePreset": "macos_arm64_cpu_release" }, { "name": "macos_cpu_debug", diff --git a/cmake/presets/CMakeMacOSConfigPresets.json b/cmake/presets/CMakeMacOSConfigPresets.json index a425dc595..7de4f39a9 100644 --- a/cmake/presets/CMakeMacOSConfigPresets.json +++ b/cmake/presets/CMakeMacOSConfigPresets.json @@ -4,13 +4,12 @@ "CMakeLinuxDefaultConfigPresets.json" ], "configurePresets": [ - { + { "name": "macos_default", "generator": "Unix Makefiles", "binaryDir": "${sourceDir}/build/cpu", "cacheVariables": { "CMAKE_POSITION_INDEPENDENT_CODE": "ON", - "CMAKE_OSX_ARCHITECTURES": "arm64", "USE_CUDA": "OFF", "USE_ROCM": "OFF" }, @@ -87,6 +86,28 @@ "macos_default", "macos_minsizerel_default" ] + }, + { + "name": "macos_x64_cpu_release", + "inherits": [ + "macos_cpu_release" + ], + "cacheVariables": { + "CMAKE_OSX_ARCHITECTURES": "x86_64" + }, + "displayName": "macos x64 cpu release", + "binaryDir": "${sourceDir}/build/cpu/osx-x64" + }, + { + "name": "macos_arm64_cpu_release", + "inherits": [ + "macos_cpu_release" + ], + "cacheVariables": { + "CMAKE_OSX_ARCHITECTURES": "arm64" + }, + "displayName": "macos arm64 cpu release", + "binaryDir": "${sourceDir}/build/cpu/osx-arm64" } ] } \ No newline at end of file diff --git a/examples/csharp/HelloPhi/Program.cs b/examples/csharp/HelloPhi/Program.cs index 82576c1ad..26e20a353 100644 --- a/examples/csharp/HelloPhi/Program.cs +++ b/examples/csharp/HelloPhi/Program.cs @@ -76,6 +76,7 @@ void PrintUsage() var sequences = tokenizer.Encode($"<|user|>{prompt}<|end|><|assistant|>"); using GeneratorParams generatorParams = new GeneratorParams(model); + generatorParams.SetSearchOption("min_length", 50); generatorParams.SetSearchOption("max_length", 200); generatorParams.SetInputSequences(sequences); if (option == 1) // Complete Output diff --git a/src/models/onnxruntime_api.h b/src/models/onnxruntime_api.h index ddc617f79..5466c7cb7 100644 --- a/src/models/onnxruntime_api.h +++ b/src/models/onnxruntime_api.h @@ -75,10 
+75,11 @@ p_session_->Run(nullptr, input_names, inputs, std::size(inputs), output_names, o
 #include "../logging.h"
 #include "env_utils.h"
 
-#if defined(__ANDROID__)
-#include <android/log.h>
+#if defined(__linux__)
 #include <dlfcn.h>
 
+#if defined(__ANDROID__)
+#include <android/log.h>
 #define TAG "GenAI"
 
 #define LOG_DEBUG(...) __android_log_print(ANDROID_LOG_DEBUG, TAG, __VA_ARGS__)
@@ -86,9 +87,15 @@ p_session_->Run(nullptr, input_names, inputs, std::size(inputs), output_names, o
 #define LOG_WARN(...) __android_log_print(ANDROID_LOG_WARN, TAG, __VA_ARGS__)
 #define LOG_ERROR(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
 #define LOG_FATAL(...) __android_log_print(ANDROID_LOG_FATAL, TAG, __VA_ARGS__)
+#endif
 
-#elif defined(__linux__)
+#elif defined(__APPLE__)
+#include "TargetConditionals.h"
+#if TARGET_OS_OSX && _ORT_GENAI_USE_DLOPEN
+#define MACOS_USE_DLOPEN
 #include <dlfcn.h>
+#endif
+#endif
 
 #ifndef PATH_MAX
 #define PATH_MAX (4096)
@@ -103,8 +110,6 @@ p_session_->Run(nullptr, input_names, inputs, std::size(inputs), output_names, o
 #define LOG_ERROR(...) LOG_WHEN_ENABLED(Generators::Log("error", __VA_ARGS__))
 #define LOG_FATAL(...) LOG_WHEN_ENABLED(Generators::Log("fatal", __VA_ARGS__))
 
-#endif
-
 /** \brief Free functions and a few helpers are defined inside this namespace. Otherwise all types are the C API types
  *
  */
@@ -115,7 +120,7 @@ using OrtApiBaseFn = const OrtApiBase* (*)(void);
 /// Before using this C++ wrapper API, you MUST call Ort::InitApi to set the below 'api' variable
 inline const OrtApi* api{};
 
-#if defined(__linux__)
+#if defined(__linux__) || defined(MACOS_USE_DLOPEN)
 inline std::string GetCurrentModuleDir() {
   Dl_info dl_info;
   dladdr((void*)GetCurrentModuleDir, &dl_info);
@@ -135,14 +140,16 @@ inline void* LoadDynamicLibraryIfExists(const std::string& path) {
   if (ort_lib_handle == nullptr) {
     char* err = dlerror();
     LOG_WARN("Error while dlopen: %s", (err != nullptr ? err : "Unknown"));
-    // Trying current dir
-    std::string current_module_dir = GetCurrentModuleDir();
-    std::string local_path{current_module_dir + "/" + path};
-    LOG_INFO("Attempting to dlopen %s", local_path.c_str());
-    ort_lib_handle = dlopen(local_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+    if (path.front() != '/') {
+      // If not absolute path, try search for current dir
+      std::string current_module_dir = GetCurrentModuleDir();
+      std::string local_path{current_module_dir + "/" + path};
+      LOG_INFO("Attempting to dlopen %s", local_path.c_str());
+      ort_lib_handle = dlopen(local_path.c_str(), RTLD_NOW | RTLD_LOCAL);
+    }
   }
   if (ort_lib_handle) {
-#if !defined(__ANDROID__)  // RTLD_DI_ORIGIN not available on Android
+#if !defined(__ANDROID__) && !defined(__APPLE__)  // RTLD_DI_ORIGIN not available on Android & Darwin
     char pathname[PATH_MAX];
     dlinfo((void*)ort_lib_handle, RTLD_DI_ORIGIN, &pathname);
     LOG_INFO("Loaded native library at %s", pathname);
@@ -196,7 +203,7 @@ inline void InitApi() {
     Generators::SetLogBool("ort_lib", true);
   }
 
-#if defined(__linux__)
+#if defined(__linux__) || defined(MACOS_USE_DLOPEN)
   // If the GenAI library links against the onnxruntime library, it will have a dependency on a specific
   // version of OrtGetApiBase.
   //
@@ -218,15 +225,31 @@ inline void InitApi() {
   // any libonnxruntime.so that supports one of those versions.
// - const std::string path = "libonnxruntime.so"; // "libonnxruntime4j_jni.so" is also an option if we have issues - void* ort_lib_handle = LoadDynamicLibraryIfExists(path); + void* ort_lib_handle = nullptr; + const char* ort_lib_path = std::getenv("ORT_LIB_PATH"); + if (ort_lib_path) { + ort_lib_handle = LoadDynamicLibraryIfExists(ort_lib_path); + } -#if !defined(__ANDROID__) +#if defined(__linux__) if (ort_lib_handle == nullptr) { + // For Android and NuGet Linux package, the file name is libonnxruntime.so + // "libonnxruntime4j_jni.so" is also an option on Android if we have issues + ort_lib_handle = LoadDynamicLibraryIfExists("libonnxruntime.so"); + } + + if (ort_lib_handle == nullptr) { + // On Linux it can also be `libonnxruntime.so.1`. See: https://github.com/microsoft/onnxruntime/pull/21339 ort_lib_handle = LoadDynamicLibraryIfExists("libonnxruntime.so.1"); } #endif +#if defined(MACOS_USE_DLOPEN) + if (ort_lib_handle == nullptr) { + ort_lib_handle = LoadDynamicLibraryIfExists("libonnxruntime.dylib"); + } +#endif + if (ort_lib_handle == nullptr) { throw std::runtime_error(std::string("Failed to load onnxruntime. Set ORTGENAI_LOG_ORT_LIB envvar to enable detailed logging.")); } @@ -238,11 +261,11 @@ inline void InitApi() { } InitApiWithDynamicFn(ort_api_base_fn); -#else // defined(__linux__) +#else // defined(__linux__) || defined(MACOS_USE_DLOPEN) api = OrtGetApiBase()->GetApi(ORT_API_VERSION); if (!api) throw std::runtime_error("Onnxruntime is installed but is too old, please install a newer version"); -#endif // defined(__linux__) +#endif // defined(__linux__) || defined(MACOS_USE_DLOPEN) } /** \brief All C++ methods that can fail will throw an exception of this type diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 6628862a3..c17a944ad 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -9,7 +9,7 @@ target_include_directories(python PRIVATE ${ORT_HEADER_DIR}) target_link_directories(python PRIVATE ${ORT_LIB_DIR}) target_link_libraries(python PRIVATE onnxruntime-genai-static) -if(NOT (CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")) +if(NOT (CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux" OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) target_link_libraries(python PRIVATE ${ONNXRUNTIME_LIB}) endif() diff --git a/src/python/py/_dll_directory.py b/src/python/py/_dll_directory.py index 0e47dbdce..1b09994c0 100644 --- a/src/python/py/_dll_directory.py +++ b/src/python/py/_dll_directory.py @@ -12,6 +12,10 @@ def _is_linux(): return sys.platform.startswith("linux") +def _is_macos(): + return sys.platform.startswith("darwin") + + def add_onnxruntime_dependency(package_id: str): """Add the onnxruntime shared library dependency. @@ -38,7 +42,7 @@ def add_onnxruntime_dependency(package_id: str): import ctypes _ = ctypes.CDLL(dml_path) - elif _is_linux(): + elif _is_linux() or _is_macos(): import importlib.util import ctypes import glob @@ -50,11 +54,17 @@ def add_onnxruntime_dependency(package_id: str): # Load the onnxruntime shared library here since we can find the path in python with ease. # This avoids needing to know the exact path of the shared library from native code. 
ort_package_path = ort_package.submodule_search_locations[0] - ort_lib_path = glob.glob(os.path.join(ort_package_path, "capi", "libonnxruntime.so*")) + if _is_linux(): + ort_lib_path = glob.glob(os.path.join(ort_package_path, "capi", "libonnxruntime.so*")) + elif _is_macos(): + ort_lib_path = glob.glob(os.path.join(ort_package_path, "capi", "libonnxruntime*.dylib")) if not ort_lib_path: raise ImportError("Could not find the onnxruntime shared library.") - _ = ctypes.CDLL(ort_lib_path[0]) + target_lib_path = ort_lib_path[0] + os.environ["ORT_LIB_PATH"] = target_lib_path + + _ = ctypes.CDLL(target_lib_path) def add_cuda_dependency(): diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index da3502bb4..cb5967171 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -26,7 +26,7 @@ target_link_libraries(unit_tests PRIVATE GTest::gtest_main ) -if(NOT (CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux")) +if(NOT (CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "Linux" OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) target_link_libraries(unit_tests PRIVATE ${ONNXRUNTIME_LIB}) endif() diff --git a/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj index 7d6c8ea74..711719a34 100644 --- a/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj +++ b/test/csharp/Microsoft.ML.OnnxRuntimeGenAI.Tests.csproj @@ -63,6 +63,16 @@ false + + + PreserveNewest + false + + + PreserveNewest + false + + PreserveNewest false diff --git a/test/python/requirements-macos.txt b/test/python/requirements-macos.txt new file mode 100644 index 000000000..dcf4c3d97 --- /dev/null +++ b/test/python/requirements-macos.txt @@ -0,0 +1,4 @@ +-f https://download.pytorch.org/whl/torch_stable.html +-i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ +torch==2.2.1 +ort-nightly==1.20.0.dev20240903006 diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 5ea725e17..ba464add6 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -122,7 +122,7 @@ def add_native_artifact_if_exists(xml_lines, runtime, artifact): f'' ) - runtimes = ["win-x64", "win-arm64", "linux-x64"] + runtimes = ["win-x64", "win-arm64", "linux-x64", "osx-x64", "osx-arm64"] for runtime in runtimes: if runtime.startswith("win"): add_native_artifact_if_exists(lines, runtime, "onnxruntime-genai.lib") @@ -130,6 +130,8 @@ def add_native_artifact_if_exists(xml_lines, runtime, artifact): add_native_artifact_if_exists(lines, runtime, "d3d12core.dll") if runtime.startswith("linux"): add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai.so") + if runtime.startswith("osx"): + add_native_artifact_if_exists(lines, runtime, "libonnxruntime-genai.dylib") # targets for dotnet in ["netstandard2.0", "net8.0", "native"]:
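
For anyone verifying this change locally rather than through the pipelines above, the sequence below is a minimal sketch of what the updated mac-cpu-arm64-build.yml workflow automates. Preset names, wheel and test paths, and the ORTGENAI_LOG_ORT_LIB switch are taken from this patch; the virtualenv name is illustrative, and the steps assume the ONNX Runtime headers and dylib have already been staged under ort/ at the repo root, as the workflow does.

  # Configure and build with the new arch-specific preset
  cmake --preset macos_arm64_cpu_release
  cmake --build --preset macos_arm64_cpu_release --parallel

  # Install the freshly built wheel plus the macOS-specific test requirements
  python3 -m venv genai-macos-venv
  source genai-macos-venv/bin/activate
  python3 -m pip install -r test/python/requirements.txt
  python3 -m pip install -r test/python/requirements-macos.txt
  python3 -m pip install build/cpu/osx-arm64/wheel/onnxruntime_genai*.whl --no-deps

  # Run the Python and native tests; ORTGENAI_LOG_ORT_LIB=1 logs how libonnxruntime is resolved
  export ORTGENAI_LOG_ORT_LIB=1
  python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models
  export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$PWD/build/cpu/osx-arm64
  ./build/cpu/osx-arm64/test/unit_tests

With dlopen-based loading enabled on macOS, the wheel's _dll_directory.py locates libonnxruntime*.dylib inside the installed onnxruntime package and exports its path via ORT_LIB_PATH before InitApi runs; outside Python, the loader falls back to libonnxruntime.dylib found next to the GenAI library or on DYLD_LIBRARY_PATH.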