microsoft · YUNQIUGUO · Nov 7, 2023 · Nov 7, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml
@@ -0,0 +1,44 @@
+# yaml-language-server: $schema=https://json.schemastore.org/github-action.json
+
+name: 'Rust toolchain setup'
+description: 'Common setup steps for GitHub workflows for Rust projects'
+
+runs:
+  using: composite
+  steps:
+    - uses: dtolnay/rust-toolchain@1.71.0
+      with:
+        components: clippy, rustfmt
+    - uses: extractions/setup-just@v1
+      with:
+        just-version: '1.15.0' # optional semver specification, otherwise latest
+
+    ###
+    ### Linux setup
+    ###
+    - name: rustup
+      # We need to use the nightly Rust toolchain to enable registry-auth / to connect to ADO feeds.
+      if: ${{ (runner.os == 'Linux') }}
+      run: |
+        rustup set profile minimal
+        rustup install
+      shell: bash
+    # - name: Cargo login
+    #   if: ${{ (runner.os == 'Linux') }}
+    #   run: just cargo-login-ci
+    #   shell: bash
+
+      ###
+      ### Windows setup
+      ###
+    - name: rustup
+      # We need to use the nightly Rust toolchain to enable registry-auth / to connect to ADO feeds.
+      if: ${{ (runner.os == 'Windows') }}
+      run: |
+        rustup set profile minimal
+        rustup install
+      shell: pwsh
+    # - name: Cargo login
+    #   if: ${{ (runner.os == 'Windows') }}
+    #   run: just cargo-login-ci-windows
+    #   shell: pwsh
diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml
@@ -0,0 +1,132 @@
+name: Rust
+
+on: [pull_request]
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug
+  RUST_BACKTRACE: 1
+  MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml
+
+jobs:
+  fmt:
+    name: Rustfmt
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/rust-toolchain-setup
+      - name: vendor onnxruntime source
+        run: just vendor
+      - name: fmt
+        run: cargo fmt --all -- --check
+
+  download:
+    name: Download prebuilt ONNX Runtime archive from build.rs
+    runs-on: ubuntu-latest
+    env:
+      ORT_RUST_STRATEGY: download # must be a `key: value` mapping entry; `KEY=value` is parsed as a plain string
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/rust-toolchain-setup
+      - run: rustup target install x86_64-unknown-linux-gnu
+      - run: rustup target install x86_64-apple-darwin
+      - run: rustup target install i686-pc-windows-msvc
+      - run: rustup target install x86_64-pc-windows-msvc
+      # ******************************************************************
+      - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu)
+        run: cargo build --target x86_64-unknown-linux-gnu  --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu)
+        run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz
+      # ******************************************************************
+      - name: Download prebuilt archive (CPU, x86_64-apple-darwin)
+        run: cargo build --target x86_64-apple-darwin  --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Verify prebuilt archive downloaded (CPU, x86_64-apple-darwin)
+        run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz
+      # ******************************************************************
+      - name: Download prebuilt archive (CPU, i686-pc-windows-msvc)
+        run: cargo build --target i686-pc-windows-msvc  --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc)
+        run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip
+      # ******************************************************************
+      - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc)
+        run: cargo build --target x86_64-pc-windows-msvc  --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc)
+        run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip
+      # ******************************************************************
+      - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu)
+        env:
+          ORT_USE_CUDA: "yes"
+        run: cargo build --target x86_64-unknown-linux-gnu  --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu)
+        run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz
+      # ******************************************************************
+      - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc)
+        env:
+          ORT_USE_CUDA: "yes"
+        run: cargo build --target x86_64-pc-windows-msvc  --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc)
+        run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip
+
+  test:
+    name: Test Suite
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          [
+            x86_64-unknown-linux-gnu,
+            x86_64-apple-darwin,
+            x86_64-pc-windows-msvc,
+            i686-pc-windows-msvc,
+          ]
+        include:
+          - target: x86_64-unknown-linux-gnu
+            os: ubuntu-latest
+          - target: x86_64-apple-darwin
+            os: macos-latest
+          - target: x86_64-pc-windows-msvc
+            os: windows-latest
+          - target: i686-pc-windows-msvc
+            os: windows-latest
+    env:
+      CARGO_BUILD_TARGET: ${{ matrix.target }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/rust-toolchain-setup
+      - name: vendor onnxruntime source
+        run: just vendor
+      - run: rustup target install ${{ matrix.target }}
+      - name: Install additional packages (macOS)
+        if: contains(matrix.target, 'x86_64-apple-darwin')
+        run: brew install libomp
+      - name: Build (cargo build)
+        run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Build tests (cargo test)
+        run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }}
+      - name: Build onnxruntime with 'model-fetching' feature
+        run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching
+      - name: Test onnxruntime-sys
+        run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --package onnxruntime-sys -- --test-threads=1 --nocapture
+      - name: Test onnxruntime
+        run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture
+
+  clippy:
+    name: Clippy
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/rust-toolchain-setup
+      - name: vendor onnxruntime source
+        run: just vendor
+      - run: cargo clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings
+
+  package-sys:
+    name: Package onnxruntime-sys
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/rust-toolchain-setup
+      - name: vendor onnxruntime source
+        run: just vendor
+      - run: cargo package --allow-dirty --package onnxruntime-sys
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
@@ -13,14 +13,15 @@ jobs:
       issues: write
       pull-requests: write
     steps:
-      - uses: actions/stale@v4.1.1
+      - uses: actions/stale@v8.0.0
         with:
           # Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale
           exempt-issue-labels: contributions welcome, feature request, regression
           # Override exempt-all-assignees but only to exempt the issues with an assignee to be marked as stale automatically
           exempt-all-issue-assignees: true
           # Used to ignore the issues and pull requests created before the start date
-          start-date: 20220419
+          # Start date should be April 19, 2022 - corresponds to the day the previous stale bot stopped working
+          start-date: '2022-04-19T00:00:00Z'
           # Number of days without activity before the actions/stale action labels an issue
           days-before-issue-stale: 30
           # Number of days without activity before the actions/stale action closes an issue

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -16,25 +16,6 @@
     },
     // Enable Python linting and Pylance type checking
     "python.analysis.typeCheckingMode": "basic",
-    "python.formatting.provider": "black",
-    "python.formatting.blackArgs": [
-        "--line-length",
-        "120"
-    ],
-    "python.sortImports.args": [
-        "--profile",
-        "black",
-        "--line-length",
-        "120"
-    ],
-    "python.linting.enabled": true,
-    "python.linting.flake8Enabled": true,
-    "python.linting.pylintEnabled": true,
-    "python.linting.pydocstyleEnabled": true,
-    "python.linting.pydocstyleArgs": [
-        "--convention=google"
-    ],
-    "python.linting.banditEnabled": true,
     "cpplint.lineLength": 120,
     "cpplint.filters": [
         "-build/include_subdir",

@@ -286,7 +286,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "c4f6b8c6bc94ff69048492fb34df0dfaf1983933",
+          "commitHash": "6f47420213f757831fae65c686aa471749fa8d60",
           "repositoryUrl": "https://github.com/NVIDIA/cutlass.git"
         },
         "comments": "cutlass"
@@ -316,7 +316,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "d52ec01652b7d620386251db92455968d8d90bdc",
+          "commitHash": "a4f72a314a85732ed67d5aa8d1088d207a7e0e61",
           "repositoryUrl": "https://github.com/ROCmSoftwarePlatform/composable_kernel.git"
         },
         "comments": "composable_kernel"

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -114,9 +114,7 @@ option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF)
 option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
 option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)
 option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF)
-
-#It's preferred to turn it OFF when onnxruntime is dynamically linked to PROTOBUF. But Tensort always required the full version of protobuf.
-cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF "NOT onnxruntime_USE_TENSORRT" ON)
+option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF)
 option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
 option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
 option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
@@ -526,7 +524,21 @@ if(NOT WIN32 AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android")
   find_package(Iconv REQUIRED)
   set(ICONV_LIB Iconv::Iconv)
 endif()
+
 find_package(Patch)
+if(WIN32 AND NOT Patch_FOUND)
+  # CI machines may lack the patch tool normally shipped with the git install, so fall back to the binary
+  # checked into this repo. This mirrors https://github.com/Kitware/CMake/blob/master/Modules/FindPatch.cmake
+  # minus the hardcoded suffixes that module appends to the patch binary's path.
+  find_program(Patch_EXECUTABLE NAMES patch PATHS "${PROJECT_SOURCE_DIR}/external/git.Win32.2.41.03.patch")
+  if(Patch_EXECUTABLE)
+    set(Patch_FOUND 1)
+    if(NOT TARGET Patch::patch)
+      add_executable(Patch::patch IMPORTED)
+      set_target_properties(Patch::patch PROPERTIES IMPORTED_LOCATION "${Patch_EXECUTABLE}")
+    endif()
+  endif()
+endif()
 if(Patch_FOUND)
   message("Patch found: ${Patch_EXECUTABLE}")
 endif()

diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -51,7 +51,7 @@ pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8b
 re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374
 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac
 tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381
-cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.0.0.zip;0f95b3c1fc1bd1175c4a90b2c9e39074d1bccefd
+cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.1.0.zip;757f90a795034a89d4f48a79d1f009f7a04c8dee
 utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156
 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c
-composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/d52ec01652b7d620386251db92455968d8d90bdc.zip;6b5ce8edf3625f8817086c194fbf94b664e1b0e0
+composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/a4f72a314a85732ed67d5aa8d1088d207a7e0e61.zip;f57357ab6d300e207a632d034ebc8aa036a090d9
diff --git a/cmake/external/abseil-cpp.natvis b/cmake/external/abseil-cpp.natvis
@@ -30,7 +30,6 @@
     <Intrinsic Name="_capacity" Expression="_commonfields().capacity_"/>
     <Intrinsic Name="_control" Expression="_commonfields().control_"/>
     <Intrinsic Name="_slots" Expression="(slot_type*)(_commonfields().slots_)"/>
-    <DisplayString Condition="_size() == 0">empty</DisplayString>
     <DisplayString IncludeView="noparens">size={ _size() }</DisplayString>
     <DisplayString ExcludeView="noparens">size=({_size()})</DisplayString>
     <Expand>

diff --git a/cmake/external/composable_kernel.cmake b/cmake/external/composable_kernel.cmake
@@ -12,13 +12,14 @@ if(NOT composable_kernel_POPULATED)
   FetchContent_Populate(composable_kernel)
   set(BUILD_DEV OFF CACHE BOOL "Disable -Weverything, otherwise, error: 'constexpr' specifier is incompatible with C++98 [-Werror,-Wc++98-compat]" FORCE)
   # Exclude i8 device gemm instances due to excessive long compilation time and not being used
-  set(DTYPES fp32 fp16 bf16)
+  set(DTYPES fp32 fp16 bf16 fp8)
   set(INSTANCES_ONLY ON)
   add_subdirectory(${composable_kernel_SOURCE_DIR} ${composable_kernel_BINARY_DIR} EXCLUDE_FROM_ALL)
 
   add_library(onnxruntime_composable_kernel_includes INTERFACE)
   target_include_directories(onnxruntime_composable_kernel_includes INTERFACE
     ${composable_kernel_SOURCE_DIR}/include
+    ${composable_kernel_BINARY_DIR}/include
     ${composable_kernel_SOURCE_DIR}/library/include)
   target_compile_definitions(onnxruntime_composable_kernel_includes INTERFACE __fp32__ __fp16__ __bf16__)
 endif()
diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake
@@ -4,7 +4,6 @@ if (onnxruntime_USE_FLASH_ATTENTION OR onnxruntime_USE_MEMORY_EFFICIENT_ATTENTIO
     cutlass
     URL ${DEP_URL_cutlass}
     URL_HASH SHA1=${DEP_SHA1_cutlass}
-    PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass.patch
   )
 
   FetchContent_GetProperties(cutlass)

diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
@@ -1,23 +1,14 @@
-
 if (onnxruntime_USE_PREINSTALLED_EIGEN)
     add_library(eigen INTERFACE)
     file(TO_CMAKE_PATH ${eigen_SOURCE_PATH} eigen_INCLUDE_DIRS)
     target_include_directories(eigen INTERFACE ${eigen_INCLUDE_DIRS})
 else ()
-    if (onnxruntime_USE_ACL)
-        FetchContent_Declare(
-            eigen
-            URL ${DEP_URL_eigen}
-            URL_HASH SHA1=${DEP_SHA1_eigen}
-            PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-space-change --ignore-whitespace < ${PROJECT_SOURCE_DIR}/patches/eigen/Fix_Eigen_Build_Break.patch
-        )
-    else()
-        FetchContent_Declare(
-            eigen
-            URL ${DEP_URL_eigen}
-            URL_HASH SHA1=${DEP_SHA1_eigen}
-        )
-    endif()
+    FetchContent_Declare(
+        eigen
+        URL ${DEP_URL_eigen}
+        URL_HASH SHA1=${DEP_SHA1_eigen}
+    )
+
     FetchContent_Populate(eigen)
     set(eigen_INCLUDE_DIRS  "${eigen_SOURCE_DIR}")
 endif()
diff --git a/cmake/external/git.Win32.2.41.03.patch/msys-2.0.dll b/cmake/external/git.Win32.2.41.03.patch/msys-2.0.dll
diff --git a/cmake/external/git.Win32.2.41.03.patch/msys-gcc_s-1.dll b/cmake/external/git.Win32.2.41.03.patch/msys-gcc_s-1.dll
diff --git a/cmake/external/git.Win32.2.41.03.patch/patch.exe b/cmake/external/git.Win32.2.41.03.patch/patch.exe
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
@@ -335,6 +335,7 @@ if(onnxruntime_USE_CUDA)
     URL ${DEP_URL_microsoft_gsl}
     URL_HASH SHA1=${DEP_SHA1_microsoft_gsl}
     PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/gsl/1064.patch
+    FIND_PACKAGE_ARGS 4.0 NAMES Microsoft.GSL
   )
 else()
   FetchContent_Declare(

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
@@ -282,11 +282,7 @@ endif()
 
 # Assemble the Apple static framework (iOS and macOS)
 if(onnxruntime_BUILD_APPLE_FRAMEWORK)
-  if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
-    set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
-  else() # macOS
-    set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
-  endif()
+  set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
 
   # Setup the various directories required. Remove any existing ones so we start with a clean directory.
   set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries)

diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -33,6 +33,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
   ${MLAS_SRC_DIR}/qpostprocessor.cpp
   ${MLAS_SRC_DIR}/qlgavgpool.cpp
   ${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
+  ${MLAS_SRC_DIR}/sqnbitgemm.cpp
 )
 
 if (NOT onnxruntime_ORT_MINIMAL_BUILD)
@@ -68,6 +69,7 @@ function(setup_mlas_source_for_windows)
         ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
         ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
         ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
+        ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
       )
 
       set(mlas_platform_preprocess_srcs
@@ -334,6 +336,7 @@ else()
           ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
           ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
           ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
+          ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
         )
         if (NOT APPLE)
           set(mlas_platform_srcs