Commit 8b7d8cd

Merge branch 'main' of https://github.com/microsoft/onnxruntime into bugapi

xadupre committed Dec 21, 2023
2 parents faf3f4f + 914bc40
Showing 122 changed files with 27,306 additions and 3,375 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/labeler.yml
@@ -7,7 +7,7 @@ jobs:
   triage:
     runs-on: ubuntu-latest
     steps:
-      - uses: github/issue-labeler@v3.2
+      - uses: github/issue-labeler@v3.3
         with:
           repo-token: "${{ secrets.GITHUB_TOKEN }}"
           configuration-path: .github/labeler.yml
2 changes: 1 addition & 1 deletion .github/workflows/publish-java-apidocs.yml
@@ -25,7 +25,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Set up JDK 11
-        uses: actions/setup-java@v3
+        uses: actions/setup-java@v4
         with:
           java-version: '11'
           distribution: 'adopt'
2 changes: 1 addition & 1 deletion .github/workflows/publish-js-apidocs.yml
@@ -25,7 +25,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - name: Setup Node.js
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           node-version: 18
       - name: Generate JS docs
2 changes: 1 addition & 1 deletion .github/workflows/windows.yml
@@ -26,7 +26,7 @@ jobs:
         python-version: '3.11.x'
         architecture: 'x64'

-      - uses: actions/setup-node@v3
+      - uses: actions/setup-node@v4
         with:
           node-version: 18

12 changes: 12 additions & 0 deletions cmake/CMakeLists.txt
@@ -87,6 +87,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
+option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" ON)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
@@ -1166,6 +1167,17 @@ if (onnxruntime_USE_DNNL)
add_compile_definitions(DNNL_OPENMP)
endif()

+set(USE_JBLAS FALSE)
+if (onnxruntime_USE_JBLAS AND NOT onnxruntime_MINIMAL_BUILD)
+  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64")
+    add_compile_definitions(MLAS_JBLAS)
+    set(USE_JBLAS TRUE)
+  elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64")
+    add_compile_definitions(MLAS_JBLAS)
+    set(USE_JBLAS TRUE)
+  endif()
+endif()

# TVM EP
if (onnxruntime_USE_TVM)
if (NOT TARGET tvm)
16 changes: 14 additions & 2 deletions cmake/onnxruntime_mlas.cmake
@@ -45,6 +45,15 @@ endif()

set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)

+function(add_jblas)
+  add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas)
+  target_link_libraries(onnxruntime_mlas PRIVATE jblas::jblas)
+  target_sources(onnxruntime_mlas PRIVATE
+    ${MLAS_SRC_DIR}/jblas_gemm.cpp
+  )
+  set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR OFF)
+endfunction()

#TODO: set MASM flags properly
function(setup_mlas_source_for_windows)

@@ -200,7 +209,6 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/q4gemm_avx512.cpp
)
endif()

else()
target_sources(onnxruntime_mlas PRIVATE
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
@@ -566,7 +574,7 @@ else()
)
set_source_files_properties(${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
set_source_files_properties(${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
endif()
endif()

if(ONNXRUNTIME_MLAS_MULTI_ARCH)
onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs})
@@ -604,6 +612,10 @@ else()
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
endif()

+if(USE_JBLAS)
+  add_jblas()
+endif()

foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR})
onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
2 changes: 2 additions & 0 deletions docs/ContribOperators.md
@@ -2824,6 +2824,8 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
<dd>size of each input feature</dd>
<dt><tt>N</tt> : int (required)</dt>
<dd>size of each output feature</dd>
+<dt><tt>accuracy_level</tt> : int</dt>
+<dd>The minimum accuracy level of input A, can be: 0(unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8) (default unset). It is used to control how input A is quantized or downcast internally while doing computation, for example: 0 means input A will not be quantized or downcast while doing computation. 4 means input A can be quantized with the same block_size to int8 internally from type T1.</dd>
<dt><tt>bits</tt> : int (required)</dt>
<dd>number of bits used for weight quantization (default 4)</dd>
<dt><tt>block_size</tt> : int (required)</dt>
3 changes: 2 additions & 1 deletion docs/OperatorKernels.md
@@ -80,7 +80,8 @@ Do not modify directly.*
|Crop|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(float)|
|CumSum|*in* x:**T**<br> *in* axis:**T2**<br> *out* y:**T**|14+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)<br/> **T2** = tensor(int32), tensor(int64)|
|||[11, 13]|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)<br/> **T2** = tensor(int32), tensor(int64)|
-|DFT|*in* input:**T1**<br> *in* dft_length:**T2**<br> *in* axis:**tensor(int64)**<br> *out* output:**T1**<br><br>or<br><br>*in* input:**T1**<br> *in* dft_length:**T2**<br> *out* output:**T1**|17+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int32), tensor(int64)|
+|DFT|*in* input:**T1**<br> *in* dft_length:**T2**<br> *in* axis:**tensor(int64)**<br> *out* output:**T1**<br><br>or<br><br>*in* input:**T1**<br> *in* dft_length:**T2**<br> *out* output:**T1**|20+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int32), tensor(int64)|
+|||[17, 19]|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int32), tensor(int64)|
|DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float)|
|||[11, 12]|**T** = tensor(double), tensor(float)|
|||[1, 10]|**T** = tensor(double), tensor(float)|
6 changes: 0 additions & 6 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3593,17 +3593,11 @@ struct OrtApi {
*
* QNN supported keys:
* "backend_path": file path to QNN backend library.
* "qnn_context_cache_enable": 1 to enable QNN graph creation from cached QNN context file. If it's enabled: QNN EP will
* load from cached QNN context binary if it exist. It will generate a context binary file if it's not exist
* "qnn_context_cache_path": explicitly provide the QNN context cache file. Default to model_file.onnx.bin if not provided.
* "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
* "rpc_control_latency": QNN RPC control latency.
* "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
* "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
* "high_power_saver", "low_balanced", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
* "qnn_context_embed_mode", 1 means dump the QNN context binary into node attribute EPContext->ep_cache_context in the ONNX skeleton model.
* 0 means dump the QNN context binary into separate bin file and set the path to EPContext->ep_cache_context.
* The path is relative path to the ONNX skeleton model file.
* "qnn_saver_path": File path to the QNN Saver backend library. If specified, QNN Saver will be enabled and will
* dump QNN API calls to disk for replay/debugging. QNN Saver produces incorrect model inference results and
* may alter model/EP partitioning. Use only for debugging.
15 changes: 15 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -235,3 +235,18 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFileName =
// Use this config to control the minimum size of the initializer when externalizing it during serialization
static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
"session.optimized_model_external_initializers_min_size_in_bytes";

+// Enable EP context feature to dump the partitioned graph which include the EP context into Onnx file.
+// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
+// "0": disable. (default)
+// "1": enable.
+static const char* const kOrtSessionOptionEpContextEnable = "ep.context_enable";

+// Specify the file path for the Onnx model which has EP context.
+// Default to original_file_name_ctx.onnx if not specified
+static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_path";

+// Flag to specify whether to dump the EP context into the Onnx model.
+// "0": dump the EP context into separate file, keep the file name in the Onnx model.
+// "1": dump the EP context into the Onnx model. (default).
+static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode";
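
These three keys generalize what was previously QNN-only behavior: the same commit strips the `qnn_context_cache_*` options from the QNN provider documentation in `onnxruntime_c_api.h` above. In the C, C++, and Python APIs they travel through the session-config-entry mechanism. Below is a minimal TypeScript sketch, assuming the `extra` session-option bag in onnxruntime-web is flattened into dotted config keys (`ep.context_enable` and friends) and forwarded to the native session options; the model path is illustrative.

```ts
import * as ort from 'onnxruntime-web';

// Sketch under the assumption stated above: nested `extra` entries become
// dotted session config keys ("ep.context_enable", "ep.context_file_path",
// "ep.context_embed_mode") on the native SessionOptions.
const session = await ort.InferenceSession.create('model.onnx', {
  extra: {
    ep: {
      context_enable: '1',                  // dump the partitioned graph with EP context
      context_file_path: 'model_ctx.onnx',  // defaults to original_file_name_ctx.onnx
      context_embed_mode: '1',              // '1' (default): embed context; '0': separate file
    },
  },
});
```

Re-running inference from the dumped model then skips the EP graph partitioning/compile overhead, per the comment above.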
2 changes: 1 addition & 1 deletion js/common/lib/backend-impl.ts
@@ -82,7 +82,7 @@ export const resolveBackend = async(backendHints: readonly string[]): Promise<Backend> => {
const isInitializing = !!backendInfo.initPromise;
try {
if (!isInitializing) {
-        backendInfo.initPromise = backendInfo.backend.init();
+        backendInfo.initPromise = backendInfo.backend.init(backendName);
}
await backendInfo.initPromise;
backendInfo.initialized = true;
2 changes: 1 addition & 1 deletion js/common/lib/backend.ts
@@ -71,7 +71,7 @@ export interface Backend {
/**
* Initialize the backend asynchronously. Should throw when failed.
*/
-  init(): Promise<void>;
+  init(backendName: string): Promise<void>;

createInferenceSessionHandler(uriOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions):
Promise<InferenceSessionHandler>;
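
Since every `Backend` implementation must now accept the resolved name, here is a minimal sketch of a conforming custom backend. The class, its registration name, and its priority are hypothetical; only `registerBackend` and the types shown in this diff are assumed from `onnxruntime-common`.

```ts
import {Backend, InferenceSession, InferenceSessionHandler, registerBackend} from 'onnxruntime-common';

// Hypothetical backend showing the new init(backendName) contract: one
// Backend object can be registered under several names and branch on the
// name it is resolved as.
class MyBackend implements Backend {
  async init(backendName: string): Promise<void> {
    console.log(`initializing as '${backendName}'`);
  }

  async createInferenceSessionHandler(
      uriOrBuffer: string|Uint8Array,
      options?: InferenceSession.SessionOptions): Promise<InferenceSessionHandler> {
    throw new Error('session creation is out of scope for this sketch');
  }
}

registerBackend('my-backend', new MyBackend(), 100);
```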
2 changes: 1 addition & 1 deletion js/node/lib/backend.ts
@@ -20,7 +20,7 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler {
}

async dispose(): Promise<void> {
-    return Promise.resolve();
+    this.#inferenceSession.dispose();
}

readonly inputNames: string[];
2 changes: 2 additions & 0 deletions js/node/lib/binding.ts
@@ -28,6 +28,8 @@ export declare namespace Binding {
readonly outputNames: string[];

run(feeds: FeedsType, fetches: FetchesType, options: RunOptions): ReturnType;

+  dispose(): void;
}

export interface InferenceSessionConstructor {
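
The `dispose(): void` declaration above is the native surface behind the handler change in `js/node/lib/backend.ts`. A usage sketch from the public API, assuming the session object exposes a `release()` entry point (current onnxruntime-node releases do) that reaches the handler's `dispose()`:

```ts
import * as ort from 'onnxruntime-node';

async function main(): Promise<void> {
  const session = await ort.InferenceSession.create('model.onnx');  // illustrative path
  const input = new ort.Tensor('float32', Float32Array.from([1, 2, 3, 4]), [1, 4]);
  await session.run({input});  // the feed name must match an entry of session.inputNames

  // Assumption: release() forwards to OnnxruntimeSessionHandler.dispose(),
  // which now calls the native dispose() instead of resolving a no-op.
  // Afterwards, run/loadModel/name accessors throw "Session already disposed."
  await session.release();
}

main().catch(console.error);
```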
19 changes: 18 additions & 1 deletion js/node/src/inference_session_wrap.cc
@@ -31,6 +31,7 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
Napi::Function func = DefineClass(
env, "InferenceSession",
{InstanceMethod("loadModel", &InferenceSessionWrap::LoadModel), InstanceMethod("run", &InferenceSessionWrap::Run),
InstanceMethod("dispose", &InferenceSessionWrap::Dispose),
InstanceAccessor("inputNames", &InferenceSessionWrap::GetInputNames, nullptr, napi_default, nullptr),
InstanceAccessor("outputNames", &InferenceSessionWrap::GetOutputNames, nullptr, napi_default, nullptr)});

@@ -45,14 +46,15 @@ Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
}

InferenceSessionWrap::InferenceSessionWrap(const Napi::CallbackInfo &info)
-    : Napi::ObjectWrap<InferenceSessionWrap>(info), initialized_(false), session_(nullptr),
+    : Napi::ObjectWrap<InferenceSessionWrap>(info), initialized_(false), disposed_(false), session_(nullptr),
defaultRunOptions_(nullptr) {}

Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
Napi::HandleScope scope(env);

ORT_NAPI_THROW_ERROR_IF(this->initialized_, env, "Model already loaded. Cannot load model multiple times.");
+  ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

size_t argsLength = info.Length();
ORT_NAPI_THROW_TYPEERROR_IF(argsLength == 0, env, "Expect argument: model file path or buffer.");
@@ -129,6 +131,7 @@ Napi::Value InferenceSessionWrap::LoadModel(const Napi::CallbackInfo &info) {
Napi::Value InferenceSessionWrap::GetInputNames(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
+  ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

Napi::EscapableHandleScope scope(env);
return scope.Escape(CreateNapiArrayFrom(env, inputNames_));
@@ -137,6 +140,7 @@ Napi::Value InferenceSessionWrap::GetInputNames(const Napi::CallbackInfo &info)
Napi::Value InferenceSessionWrap::GetOutputNames(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
+  ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");

Napi::EscapableHandleScope scope(env);
return scope.Escape(CreateNapiArrayFrom(env, outputNames_));
@@ -145,6 +149,7 @@ Napi::Value InferenceSessionWrap::GetOutputNames(const Napi::CallbackInfo &info)
Napi::Value InferenceSessionWrap::Run(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
+  ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");
ORT_NAPI_THROW_TYPEERROR_IF(info.Length() < 2, env, "Expect argument: inputs(feed) and outputs(fetch).");
ORT_NAPI_THROW_TYPEERROR_IF(!info[0].IsObject() || !info[1].IsObject(), env,
"Expect inputs(feed) and outputs(fetch) to be objects.");
@@ -209,6 +214,18 @@
}
}

+Napi::Value InferenceSessionWrap::Dispose(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  ORT_NAPI_THROW_ERROR_IF(!this->initialized_, env, "Session is not initialized.");
+  ORT_NAPI_THROW_ERROR_IF(this->disposed_, env, "Session already disposed.");
+
+  this->defaultRunOptions_.reset(nullptr);
+  this->session_.reset(nullptr);
+
+  this->disposed_ = true;
+  return env.Undefined();
+}

Napi::Value InferenceSessionWrap::ListSupportedBackends(const Napi::CallbackInfo &info) {
Napi::Env env = info.Env();
Napi::EscapableHandleScope scope(env);
9 changes: 9 additions & 0 deletions js/node/src/inference_session_wrap.h
@@ -55,13 +55,22 @@ class InferenceSessionWrap : public Napi::ObjectWrap<InferenceSessionWrap> {
*/
Napi::Value Run(const Napi::CallbackInfo &info);

+  /**
+   * [sync] dispose the session.
+   * @param nothing
+   * @returns nothing
+   * @throw nothing
+   */
+  Napi::Value Dispose(const Napi::CallbackInfo &info);

// private members

// persistent constructor
static Napi::FunctionReference constructor;

// session objects
bool initialized_;
+  bool disposed_;
std::unique_ptr<Ort::Session> session_;
std::unique_ptr<Ort::RunOptions> defaultRunOptions_;

17 changes: 14 additions & 3 deletions js/web/lib/backend-wasm.ts
@@ -4,7 +4,7 @@
import {cpus} from 'node:os';
import {Backend, env, InferenceSession, InferenceSessionHandler} from 'onnxruntime-common';

-import {initializeWebAssemblyInstance} from './wasm/proxy-wrapper';
+import {initializeOrtEp, initializeWebAssemblyAndOrtRuntime} from './wasm/proxy-wrapper';
import {OnnxruntimeWebAssemblySessionHandler} from './wasm/session-handler-inference';

/**
@@ -33,12 +33,23 @@ export const initializeFlags = (): void => {
};

export class OnnxruntimeWebAssemblyBackend implements Backend {
-  async init(): Promise<void> {
+  /**
+   * This function initializes the WebAssembly backend.
+   *
+   * This function will be called only once for each backend name. It will be called the first time when
+   * `ort.InferenceSession.create()` is called with a registered backend name.
+   *
+   * @param backendName - the registered backend name.
+   */
+  async init(backendName: string): Promise<void> {
// populate wasm flags
initializeFlags();

// init wasm
-    await initializeWebAssemblyInstance();
+    await initializeWebAssemblyAndOrtRuntime();

+    // performe EP specific initialization
+    await initializeOrtEp(backendName);
}
createInferenceSessionHandler(path: string, options?: InferenceSession.SessionOptions):
Promise<InferenceSessionHandler>;
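
The new doc comment pins down the ordering: flags are read from `ort.env` once, the wasm runtime and ORT are booted once, and then the EP-specific step runs for the resolved backend name. A small usage sketch of that contract (model path illustrative):

```ts
import * as ort from 'onnxruntime-web';

// Wasm flags must be set before the first InferenceSession.create() call,
// because initializeFlags() snapshots them during backend init.
ort.env.wasm.numThreads = 4;

// Resolving 'webgpu' triggers init('webgpu'): shared wasm/ORT runtime setup
// first, then the EP-specific step (initializeOrtEp('webgpu')).
const session = await ort.InferenceSession.create('model.onnx', {
  executionProviders: ['webgpu'],
});
```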
2 changes: 1 addition & 1 deletion js/web/lib/index.ts
@@ -21,7 +21,7 @@ if (!BUILD_DEFS.DISABLE_WEBGL) {
if (!BUILD_DEFS.DISABLE_WASM) {
const wasmBackend = BUILD_DEFS.DISABLE_TRAINING ? require('./backend-wasm-inference').wasmBackend :
require('./backend-wasm-training').wasmBackend;
-  if (!BUILD_DEFS.DISABLE_WEBGPU && typeof navigator !== 'undefined' && navigator.gpu) {
+  if (!BUILD_DEFS.DISABLE_WEBGPU) {
registerBackend('webgpu', wasmBackend, 5);
}
registerBackend('cpu', wasmBackend, 10);
12 changes: 1 addition & 11 deletions js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -144,17 +144,7 @@
*/
sessionExternalDataMapping: Map<number, Map<number, [number, GPUBuffer]>> = new Map();

-  async initialize(env: Env): Promise<void> {
-    if (!navigator.gpu) {
-      // WebGPU is not available.
-      throw new Error('WebGpuBackend: WebGPU is not available.');
-    }
-
-    const adapter = await navigator.gpu.requestAdapter();
-    if (!adapter) {
-      throw new Error('WebGpuBackend: Failed to get GPU adapter.');
-    }
-
+  async initialize(env: Env, adapter: GPUAdapter): Promise<void> {
this.env = env;
const requiredFeatures: GPUFeatureName[] = [];
const deviceDescriptor: GPUDeviceDescriptor = {
(The remainder of this file's diff and the other changed files did not load in this view.)
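
One note on the `WebGpuBackend.initialize(env, adapter)` change above: the availability probe and `requestAdapter()` call move out of the backend, which squares with `js/web/lib/index.ts` now registering 'webgpu' unconditionally and leaving the check to init time. A sketch of the caller's side, assuming the imports shown (`Env` from onnxruntime-common) and that the real call site lives in the JSEP init path not included in this view:

```ts
import {Env} from 'onnxruntime-common';
import {WebGpuBackend} from './backend-webgpu';  // path as in this diff

// Hypothetical caller performing the adapter discovery that initialize()
// no longer does itself.
export async function createWebGpuBackend(env: Env): Promise<WebGpuBackend> {
  if (!navigator.gpu) {
    throw new Error('WebGpuBackend: WebGPU is not available.');
  }
  const adapter = await navigator.gpu.requestAdapter();
  if (!adapter) {
    throw new Error('WebGpuBackend: Failed to get GPU adapter.');
  }
  const backend = new WebGpuBackend();
  await backend.initialize(env, adapter);  // adapter is now supplied by the caller
  return backend;
}
```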
