Merge branch 'refs/heads/main' into Cjian/gha-ort-nightlhy
# Conflicts:
#	.github/workflows/linux-cpu-x64-build.yml
#	.github/workflows/linux-gpu-x64-build.yml
#	.github/workflows/mac-cpu-arm64-build.yml
#	.github/workflows/win-cpu-x64-build.yml
#	.github/workflows/win-gpu-x64-build.yml
jchen351 committed Apr 19, 2024
2 parents 9315bac + 2a7a890 commit 1ef8525
Showing 92 changed files with 3,339 additions and 582 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/linux-cpu-arm64-build.yml
@@ -4,9 +4,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-linux-aarch64-1.17.1"
-  ort_zip: "onnxruntime-linux-aarch64-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-aarch64-1.17.1.tgz"
+  ort_dir: "onnxruntime-linux-aarch64-1.17.3"
+  ort_zip: "onnxruntime-linux-aarch64-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-aarch64-1.17.3.tgz"
 jobs:
   linux-cpu-arm64-build:
     runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2004-ARM-CPU" ]
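The hunk above only bumps the pinned OnnxRuntime release from 1.17.1 to 1.17.3; a later workflow step (not shown here) downloads and unpacks the archive named by these env values. A minimal Python sketch of that fetch-and-extract, using the URL from this diff — the local paths and the use of the standard library are illustrative, not the workflow's actual step:

```python
import tarfile
import urllib.request

# Values mirrored from the env block in the diff above.
ort_zip = "onnxruntime-linux-aarch64-1.17.3.tgz"
ort_url = ("https://github.com/microsoft/onnxruntime/releases/download/"
           "v1.17.3/" + ort_zip)

# Download the release archive, then unpack it into the working directory.
urllib.request.urlretrieve(ort_url, ort_zip)
with tarfile.open(ort_zip, "r:gz") as tar:
    tar.extractall(".")  # yields ./onnxruntime-linux-aarch64-1.17.3/
```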
6 changes: 3 additions & 3 deletions .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -12,9 +12,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-linux-x64-1.17.1"
-  ort_zip: "onnxruntime-linux-x64-1.17.1.tgz"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-1.17.1.tgz"
+  ort_dir: "onnxruntime-linux-x64-1.17.3"
+  ort_zip: "onnxruntime-linux-x64-1.17.3.tgz"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-linux-x64-1.17.3.tgz"
 jobs:
   job:
     runs-on: [ "self-hosted", "1ES.Pool=onnxruntime-genai-Ubuntu2204-AMD-CPU" ]
6 changes: 3 additions & 3 deletions .github/workflows/win-cpu-arm64-build.yml
@@ -11,9 +11,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 env:
-  ort_dir: "onnxruntime-win-arm64-1.17.1"
+  ort_dir: "onnxruntime-win-arm64-1.17.3"
   ort_zip: "$(ort_dir).zip"
-  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/$(ort_zip)"
+  ort_url: "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/$(ort_zip)"
   binaryDir: 'build/cpu'

 jobs:
@@ -33,7 +33,7 @@ jobs:

       - name: Download OnnxRuntime
         run: |
-          $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-win-arm64-1.17.1.zip"
+          $env:ort_url = "https://github.com/microsoft/onnxruntime/releases/download/v1.17.3/onnxruntime-win-arm64-1.17.3.zip"
           Invoke-WebRequest -Uri $env:ort_url -OutFile $env:ort_zip
       - name: Unzip OnnxRuntime
4 changes: 3 additions & 1 deletion .gitignore
@@ -22,4 +22,6 @@ examples/python/genai_models
 examples/python/hf_cache

 !test/test_models/hf-internal-testing/
-!test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx
+!test/test_models/hf-internal-testing/tiny-random-gpt2*/*.onnx
+
+.ipynb_checkpoints/
2 changes: 1 addition & 1 deletion .pipelines/nuget-publishing.yml
@@ -23,7 +23,7 @@ parameters:
 - name: ort_version
   displayName: 'OnnxRuntime version'
   type: string
-  default: '1.17.1'
+  default: '1.17.3'

 - name: cuda_version
   displayName: 'CUDA version'
2 changes: 1 addition & 1 deletion .pipelines/pypl-publishing.yml
@@ -22,7 +22,7 @@ parameters:
 - name: ort_version
   displayName: 'OnnxRuntime version'
   type: string
-  default: '1.17.1'
+  default: '1.17.3'

 - name: cuda_version
   displayName: 'CUDA version'
3 changes: 3 additions & 0 deletions README.md
@@ -53,6 +53,7 @@ See full documentation at [https://onnxruntime.ai/docs/genai].
 1. Build the model
 ```shell
 python -m onnxruntime_genai.models.builder -m microsoft/phi-2 -e cpu -p int4 -o ./models/phi2
+# You can append --extra_options enable_cuda_graph=1 to build an onnx model that supports using cuda graph in ORT.
 ```

 2. Run inference
@@ -75,6 +76,8 @@ tokens = tokenizer.encode(prompt)

 params = og.GeneratorParams(model)
 params.set_search_options({"max_length":200})
+# Add the following line to enable cuda graph by passing the maximum batch size.
+# params.try_use_cuda_graph_with_max_batch_size(16)
 params.input_ids = tokens

 output_tokens = model.generate(params)
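Taken together, the two README additions wire CUDA graph support end to end: build the model with enable_cuda_graph=1, then opt in at generation time with try_use_cuda_graph_with_max_batch_size. A sketch of the resulting script, assuming a model exported to ./models/phi2 as in the build step (the prompt and surrounding boilerplate are illustrative):

```python
import onnxruntime_genai as og

model = og.Model("./models/phi2")  # built with --extra_options enable_cuda_graph=1
tokenizer = og.Tokenizer(model)
tokens = tokenizer.encode("What is ONNX Runtime?")

params = og.GeneratorParams(model)
params.set_search_options({"max_length": 200})
params.try_use_cuda_graph_with_max_batch_size(16)  # opt in, per the README note
params.input_ids = tokens

output_tokens = model.generate(params)
print(tokenizer.decode(output_tokens))
```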
2 changes: 1 addition & 1 deletion VERSION_INFO
@@ -1 +1 @@
-0.1.0rc4
+0.2.0-dev
6 changes: 3 additions & 3 deletions benchmark/python/benchmark_e2e.py
@@ -18,7 +18,7 @@ def generate_prompt(model, tokenizer, prompt_length) -> str:
     prompt = "a"
     tokens = tokenizer.encode(prompt)
     params=og.GeneratorParams(model)
-    params.set_search_options({"do_sample":True, "top_k":5, "temperature":temperature, "max_length":prompt_length, "min_length":prompt_length+1})
+    params.set_search_options(do_sample=True, top_k=5, temperature=temperature, max_length=prompt_length, min_length=prompt_length+1)
     params.input_ids = tokens
     generator=og.Generator(model, params)
     while not generator.is_done():
@@ -68,7 +68,7 @@ def main(args):

     params = og.GeneratorParams(model)
     params.input_ids = tokens
-    params.set_search_options({"do_sample":True, "top_k":args.top_k, "top_p":args.top_p, "temperature":temperature, "max_length":max_length, "min_length":max_length})
+    params.set_search_options(do_sample=True, top_k=args.top_k, top_p=args.top_p, temperature=temperature, max_length=max_length, min_length=max_length)

     if args.verbose: print("Running warmup runs...")
     for _ in tqdm(range(args.warmup)):
@@ -99,7 +99,7 @@ def main(args):
     # Prepare run
     params = og.GeneratorParams(model)
     params.input_ids = tokens
-    params.set_search_options({"max_length":max_length, "min_length":max_length})
+    params.set_search_options(max_length=max_length, min_length=max_length)
     generator = og.Generator(model, params)

     # Measure prompt processing
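All three hunks in benchmark_e2e.py make the same mechanical change: set_search_options now takes keyword arguments rather than a single dict. A sketch of the new calling convention around the og.Generator loop this file uses — the per-step compute_logits/generate_next_token calls and the get_sequence accessor are assumptions about the streaming API, since the loop body is elided in this diff:

```python
import onnxruntime_genai as og

def run(model, tokenizer, prompt, max_length=200):
    params = og.GeneratorParams(model)
    # New style: keyword arguments instead of a dict literal.
    params.set_search_options(do_sample=True, top_k=5, temperature=1.0,
                              max_length=max_length)
    params.input_ids = tokenizer.encode(prompt)

    generator = og.Generator(model, params)
    while not generator.is_done():        # same loop as generate_prompt() above
        generator.compute_logits()        # assumed step-wise API
        generator.generate_next_token()   # assumed step-wise API
    return generator.get_sequence(0)      # assumed accessor for the tokens
```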
6 changes: 5 additions & 1 deletion build.py
@@ -20,6 +20,10 @@ def is_linux():
     """Check if the current platform is Linux."""
     return sys.platform.startswith("linux")

+def is_mac():
+    """Check if the current platform is MacOS"""
+    return sys.platform.startswith("darwin")
+

 def platform():
     """Get the current platform."""
@@ -110,7 +114,7 @@ def build(
     Args:
         skip_wheel: Whether to skip building the Python wheel. Defaults to False.
     """
-    if not is_windows() and not is_linux():
+    if not is_windows() and not is_linux() and not is_mac():
         raise OSError(f"Unsupported platform {platform()}.")

     if cuda_home and not use_cuda:
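The build.py change extends host support to macOS: a new is_mac() helper mirroring is_linux(), plus a third clause in the platform guard. Condensed into a self-contained sketch — the is_windows body is assumed to mirror the other helpers, as it is not shown in this diff:

```python
import sys

def is_windows():
    """Check if the current platform is Windows (assumed to mirror the others)."""
    return sys.platform.startswith("win")

def is_linux():
    """Check if the current platform is Linux."""
    return sys.platform.startswith("linux")

def is_mac():
    """Check if the current platform is macOS."""
    return sys.platform.startswith("darwin")

# The guard from build(): reject any host that is none of the three.
if not is_windows() and not is_linux() and not is_mac():
    raise OSError(f"Unsupported platform {sys.platform}.")
```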