From 5f3113ecd612b4ef070d5a3c8a586d725cfdb13b Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:49:19 +0800 Subject: [PATCH 001/100] [ROCm] Fix hipify error: fast_divmod.h: No such file or directory (#19060) Fix error: ``` [ 48%] Built target onnxruntime_optimizer In file included from /onnxruntime_src/onnxruntime/core/providers/rocm/rocm_stream_handle.cc:5: /onnxruntime_src/onnxruntime/core/providers/rocm/rocm_common.h:11:10: fatal error: core/providers/rocm/shared_inc/fast_divmod.h: No such file or directory 11 | #include "core/providers/rocm/shared_inc/fast_divmod.h" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. ``` This error is due to onnxruntime_optimizer missing dependencies on hipify generated files. --- cmake/onnxruntime_optimizer.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake index 6f09583199ffd..f15d5b8dd6f80 100644 --- a/cmake/onnxruntime_optimizer.cmake +++ b/cmake/onnxruntime_optimizer.cmake @@ -130,3 +130,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB) RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() + +if (onnxruntime_USE_ROCM) + add_dependencies(onnxruntime_optimizer generate_hipified_files) +endif() From cf78d01546ca059a2ab487e01626e38029a3e8fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Wed, 10 Jan 2024 16:36:50 +0100 Subject: [PATCH 002/100] remove use of ai.onnx.ml in test for custom ops and local functions (#19043) ### Description QNN_Nuget_Windows does not allow ai.onnx.ml operators but the test test_custom_op_local_function is using LabelEncoder. The operator can be removed as the test is only checking custom ops api. ### Motivation and Context Fix test test_custom_op_local_function in QNN_Nuget_Windows pipeline. 
--- .../custom_ops_type_inference_fails_0.onnx | Bin 2086 -> 1977 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx b/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx index 8116ec338064567cea06fafe45168567813071ed..3a43a7378a9123b1f51a30cc2270d2edaa1c0f76 100644 GIT binary patch delta 512 zcmZ1`u#=yigVSmo3zzdmLH&u|pBcR-+cQe&8*;IhCFYc-YN>KC3$QyuI9?J=3=n`$ zPcCPiE-cK&7$L;R#lyiU#KFZR!~n#TWtmc0PB9*1oji@nbnC(6Nyu#e+E6il{aR$x?~9L+4r#mU7~k}n~+fRSl(Bl8apbs?7Gl9V(h z4(I&5;*!L?5-m|KZm`qi({l0?OGNu*PdF`LWSpGATFRu#HCdWb$xwjNON>h>CqFqc zN3XCHs7Eg)KQ}QmPk=E9i|8rF*OQsqbn1?WfqiEv#FT4DSPRUb ipa21BF~Za$k*b9nHvgcSHt%5zV`TP%_)U*AzX1T^6p=>& delta 679 zcmdnVzf3@ogF}cxib07%gTaWwY8?xg-$ZtO9xYBTrjmRK!3B&=R+AN3g(rI5x3m`G zPA)Eq&&@5)NGwQI;&9H-D=taQE74NoVl7L|DNWT9<6stGw_;{uVsb)OI=PT>IzK-b zV}uYF7Y7HU5DOOr*JNp?R8c2BF0RB(z5Kkq3ccJM0j5)ouh_UIDu_*<%_PTS1a!dU zolFYeN-PXO;KW*$T9lu*fRV+@Mu;OXwH)0QYA{zwaWDd1!Sx>o7@gqCC(AP{Fse;< zV3y?N0eKMUy;Y1Ldy<)d)T;?kwTn6Zb@oh zaek2!dr&IaRaV77oiI@$uHurEG^h$5pTwlp9M`<${FKxpEeS5}?9|F)kojgNA;k`Y z4nhvX4k8N}*|pTTc)@lT$EW4wCzcf3_Q{@bvfU>T>13<=SH#Ws|63uq1&nM~Na1It z$Ax5-5DUoK$*-7YC%3Sa2%v>@5GYbvPcbfH;hH>=QD$;BD_^~~5DO@Xkpe&x7Nor3 yAZ2850;>SUfq@WHt|5|A1(;G1jL-)vgy}HCql51sBEpz*jW!Fgg)uUM-3b8JD$Ni8 From df116b82c743f9104bd090a310d5777a4475e539 Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:13:25 -0800 Subject: [PATCH 003/100] Custom op API for thread pool (#18980) Allow custom op to invoke internal thread-pool for parallelism. --------- Co-authored-by: Randy Shuai --- .../core/session/onnxruntime_c_api.h | 13 +++++++ .../core/session/onnxruntime_cxx_api.h | 1 + .../core/session/onnxruntime_cxx_inline.h | 4 ++ onnxruntime/core/session/custom_ops.cc | 26 +++++++++++++ onnxruntime/core/session/onnxruntime_c_api.cc | 1 + onnxruntime/core/session/ort_apis.h | 2 + .../testdata/custom_op_library/cpu/cpu_ops.cc | 37 +++++++++++++++++-- 7 files changed, 80 insertions(+), 4 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 06fef6bf72cc9..8cd0d0051d1eb 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -4528,6 +4528,19 @@ struct OrtApi { * \since Version 1.17. */ ORT_API2_STATUS(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value); + + /** + * Run fn in parallel + * + * \param[in] context + * \param[in] fn Function accepting usr_data and an integer as iterator + * \param[in] total The number of times fn is to be invoked + * \param[in] num_batch Number of batches by which the "total" is to be divided in maximum. When zero, there is no limit + * \param[in] usr_data User data to be passed back to fn + * + * \since Version 1.17. 
+ */ + ORT_API2_STATUS(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 16d9451624533..3773a01cb65a8 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2057,6 +2057,7 @@ struct KernelContext { Logger GetLogger() const; OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const; OrtKernelContext* GetOrtKernelContext() const { return ctx_; } + void ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const; private: OrtKernelContext* ctx_; diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 63e55603736b6..db4619eeeae62 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1658,6 +1658,10 @@ inline Logger KernelContext::GetLogger() const { return Logger{out}; } +inline void KernelContext::ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const { + ThrowOnError(GetApi().KernelContext_ParallelFor(ctx_, fn, total, num_batch, usr_data)); +} + inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) { Ort::ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &p_)); } diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index eea675eb0193a..984fdd6bce325 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -24,6 +24,7 @@ #include "core/session/custom_ops.h" #include "core/session/inference_session.h" #include "core/session/ort_apis.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) static constexpr uint32_t min_ort_version_with_optional_io_support = 8; @@ -380,6 +381,31 @@ ORT_API_STATUS_IMPL(OrtApis::KernelContext_GetResource, _In_ const OrtKernelCont API_IMPL_END }; +ORT_API_STATUS_IMPL(OrtApis::KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data) { + API_IMPL_BEGIN + if (!context) { + return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, "Invalid context"); + } + if (fn && total) { + const auto* ctx = reinterpret_cast(context); + auto* tp = ctx->GetOperatorThreadPool(); + if (num_batch) { + onnxruntime::concurrency::ThreadPool::TryBatchParallelFor( + tp, + static_cast(total), + [fn, usr_data](std::ptrdiff_t ith) { fn(usr_data, static_cast(ith)); }, + static_cast(num_batch)); + } else { + onnxruntime::concurrency::ThreadPool::TrySimpleParallelFor( + tp, + static_cast(total), + [fn, usr_data](std::ptrdiff_t ith) { fn(usr_data, static_cast(ith)); }); + } + } + return nullptr; + API_IMPL_END +}; + #ifdef _WIN32 #pragma warning(pop) #endif diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 76a8a778025e1..08bfb618f55b4 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2722,6 +2722,7 @@ static constexpr OrtApi ort_api_1_to_17 = { &OrtApis::SetSymbolicDimensions, &OrtApis::ReadOpAttr, &OrtApis::SetDeterministicCompute, + &OrtApis::KernelContext_ParallelFor, }; // 
OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index c9e4074a1afe2..6df5e4145b416 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -502,4 +502,6 @@ ORT_API_STATUS_IMPL(SetSymbolicDimensions, _In_ OrtTensorTypeAndShapeInfo* info, ORT_API_STATUS_IMPL(ReadOpAttr, _In_ const OrtOpAttr* op_attr, _In_ OrtOpAttrType type, _Inout_ void* data, _In_ size_t len, _Out_ size_t* out); ORT_API_STATUS_IMPL(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value); +ORT_API_STATUS_IMPL(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* user_data); + } // namespace OrtApis diff --git a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc index 85edfa0e59f1d..ebef441350d4c 100644 --- a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc +++ b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc @@ -49,16 +49,45 @@ struct KernelOne { } }; +struct DataI { + const float* from = {}; + float* to = {}; +}; + +struct DataII { + const float* from = {}; + int32_t* to = {}; +}; + +// floats to floats +void CopyI(void* raw_data, size_t ith) { + auto data = reinterpret_cast(raw_data); + data->to[ith] = data->from[ith]; +} + +// floats to int32_t +void CopyII(void* raw_data, size_t ith) { + auto data = reinterpret_cast(raw_data); + data->to[ith] = static_cast(round(data->from[ith])); +} + // lite custom op as a function -void KernelTwo(const Ort::Custom::Tensor& X, +void KernelTwo(OrtKernelContext* context, + const Ort::Custom::Tensor& X, Ort::Custom::Tensor& Y) { const auto& shape = X.Shape(); auto X_raw = X.Data(); auto Y_raw = Y.Allocate(shape); + std::vector floats(static_cast(X.NumberOfElement()), 0.f); + + DataI data_i = {X_raw, floats.data()}; auto total = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies()); - for (int64_t i = 0; i < total; i++) { - Y_raw[i] = static_cast(round(X_raw[i])); - } + + Ort::KernelContext ctx(context); + ctx.ParallelFor(CopyI, static_cast(total), 0, &data_i); // test simple parallel for + + DataII data_ii = {floats.data(), Y_raw}; + ctx.ParallelFor(CopyII, static_cast(total), 2, &data_ii); // test batch parallel for } template From 731b50dfc4f8074185dc70f3a10236fa4fdfc0aa Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Wed, 10 Jan 2024 15:13:04 -0800 Subject: [PATCH 004/100] Support INT4 weight only quantize, including RTN and GPTQ 2 algorithms (#17390) ### Description Support INT4 weight only quantize (WOQ) via Intel Neural Compressor, including RTN and GPTQ 2 algorithms. **Note:** Please install `neural-compressor==2.3` for weight only quantize. ### Motivation and Context As large language models (LLMs) become more prevalent, there is a growing need for new and improved quantization methods that can meet the computational demands of these modern architectures while maintaining the accuracy. Compared to normal quantization like W8A8, weight only quantization is probably a better trade-off to balance the performance and the accuracy. RTN is the most straightforward way to quantize weight. GPTQ algorithm provides more accurate quantization but requires more computational resources. 
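As a usage reference, below is a minimal sketch of how the weight-only quantization configuration classes added in this PR can be driven from Python (mirroring the new unit tests). The model path, block size, and the calibration data reader are placeholders, and `neural-compressor` must be installed as noted above.

```python
from onnxruntime.quantization import matmul_4bits_quantizer

# RTN needs no calibration data.
algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig()
# GPTQ needs a CalibrationDataReader, e.g.:
# algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=data_reader)

quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
    "model_fp32.onnx",   # a ModelProto or a model path (placeholder here)
    block_size=32,
    is_symmetric=False,
    algo_config=algo_config,
)
quant.process()
quant.model.save_model_to_file("model_int4.onnx", False)
```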
### Evaluation results

The following table shows the accuracy results of Llama-2 models evaluated on the [lambada_openai](https://huggingface.co/datasets/lambada) task. `GPTQ W4G32Asym` in the Configuration column means the GPTQ algorithm is used for 4-bit weight-only quantization with group_size=32 and scheme=asym.
| Model name | Configuration | Lambada_openai Accuracy | Lambada_openai Perplexity | Accuracy Ratio [WOQ/FP32] |
|---|---|---|---|---|
| meta-llama/Llama-2-7b-chat-hf | FP32 | 0.7058 | 3.2788 | / |
| meta-llama/Llama-2-7b-chat-hf | GPTQ W4G32Asym | 0.7025 | 3.4489 | 99.53% |
| meta-llama/Llama-2-7b-hf | FP32 | 0.7392 | 3.3950 | / |
| meta-llama/Llama-2-7b-hf | GPTQ W4G32Asym | 0.7326 | 3.5286 | 99.11% |
| meta-llama/Llama-2-13b-chat-hf | FP32 | 0.7312 | 2.9163 | / |
| meta-llama/Llama-2-13b-chat-hf | GPTQ W4G128Asym | 0.7289 | 3.0061 | 99.56% |
| meta-llama/Llama-2-13b-hf | FP32 | 0.7677 | 3.0438 | / |
| meta-llama/Llama-2-13b-hf | GPTQ W4G32Asym | 0.7607 | 3.1562 | 99.09% |
| meta-llama/Llama-2-70b-chat-hf | FP32 | 0.7543 | 2.6181 | / |
| meta-llama/Llama-2-70b-chat-hf | RTN W4G32Sym | 0.7489 | 2.6850 | 99.28% |
| meta-llama/Llama-2-70b-hf | FP32 | 0.7964 | 2.6612 | / |
| meta-llama/Llama-2-70b-hf | RTN W4G32Sym | 0.7896 | 2.7546 | 99.15% |
--------- Signed-off-by: yuwenzho Co-authored-by: Wang, Mengni --- .../quantization/matmul_4bits_quantizer.py | 189 ++++++++++++++++-- .../python/tools/quantization/quantize.py | 11 +- .../quantization/test_op_matmul_4bits.py | 70 ++++++- 3 files changed, 246 insertions(+), 24 deletions(-) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 6293bcbbf95bd..3e9f9a6544a71 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -7,6 +7,8 @@ from __future__ import annotations import argparse +import copy +import importlib import logging import os @@ -14,9 +16,11 @@ import numpy.typing as npt import onnx from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto +from packaging import version from onnxruntime.capi._pybind_state import quantize_matmul_4bits +from .calibrate import CalibrationDataReader from .onnx_model import ONNXModel from .quant_utils import attribute_to_kwarg @@ -24,24 +28,98 @@ logger = logging.getLogger(__name__) +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + + Args: + algorithm: + weight only quantize algorithm name. + """ + self.algorithm = algorithm + + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + ratios=None, + ): + """ + This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration. + RTN is the most straightforward way to quantize weight using scale maps. + + Args: + ratios: + percentile of clip. Defaults to {}. + """ + if ratios is None: + ratios = {} + super().__init__( + algorithm="RTN", + ) + self.ratios = ratios + + +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + calibration_data_reader: CalibrationDataReader, + percdamp=0.01, + blocksize=128, + actorder=False, + mse=False, + perchannel=True, + ): + """ + This is a class for GPTQ algorithm Weight Only Quant Configuration. + GPTQ algorithm provides more accurate quantization but requires more computational resources. + + Args: + calibration_data_reader: + a calibration data reader. It enumerates calibration data and generates inputs for the original model. + percdamp: + percent of the average Hessian diagonal to use for dampening. + blocksize (int, optional): + channel number in one block to execute a GPTQ quantization iteration. + actorder (bool, optional): + whether rearrange Hessian matrix considering the diag's value. + mse (bool, optional): + whether get scale and zero point with mse error. + perchannel (bool, optional): + whether quantize weight per-channel. 
+ """ + super().__init__( + algorithm="GPTQ", + ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.blocksize = blocksize + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel + + class MatMul4BitsQuantizer: """Perform 4b quantization of constant MatMul weights""" def __init__( self, - model: ModelProto, + model: ModelProto | str, block_size: int, is_symmetric: bool, accuracy_level: int | None = None, - nodes_to_exclude: list[str] | None = None, + nodes_to_exclude=None, + algo_config: WeightOnlyQuantConfig = None, ): if nodes_to_exclude is None: nodes_to_exclude = [] - self.model = ONNXModel(model) + self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model) + self.model_path = model if isinstance(model, str) else None self.block_size = block_size self.is_symmetric = is_symmetric self.accuracy_level = accuracy_level self.nodes_to_exclude = set(nodes_to_exclude) + self.algo_config = algo_config @staticmethod def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: @@ -176,20 +254,99 @@ def _process_subgraph(self, graph_stack: list[GraphProto]): graph_stack.pop() return graph + def _generate_q4_node_config(self): + """Generate weight only quant configuration for nodes.""" + q4_node_config = {} + template_config_q4 = { + "bits": 4, + "group_size": self.block_size, + "scheme": "sym" if self.is_symmetric else "asym", + } + for node in self.model.model.graph.node: + if node.op_type in ["MatMul"]: + if not all([self.model.get_initializer(i) is None for i in node.input]): + q4_node_config[node.name] = template_config_q4 + return q4_node_config + + def int4_quant_algo(self): + """4b quantize a model with RTN or GPTQ algorithm. Please refer to + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md + for more details on weight only quantization using IntelĀ® Neural Compressor. 
+ """ + + def inc_dataloader(): + data_reader = copy.deepcopy(self.algo_config.calibration_data_reader) + for data in data_reader: + yield data, None + + kwargs = {} + if self.accuracy_level is not None: + kwargs["accuracy_level"] = self.accuracy_level + weight_only_node_config = self._generate_q4_node_config() + + algorithm = self.algo_config.algorithm + logger.info(f"start to quantize model with {algorithm} algorithm...") + if algorithm == "RTN": + from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize + + kwargs["ratios"] = self.algo_config.ratios + + self.model = rtn_quantize( + model=self.model_path if self.model_path is not None else self.model.model, + weight_config=weight_only_node_config, + **kwargs, + ) + elif algorithm == "GPTQ": + from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize + + kwargs["percdamp"] = self.algo_config.percdamp + kwargs["blocksize"] = self.algo_config.blocksize + kwargs["actorder"] = self.algo_config.actorder + kwargs["mse"] = self.algo_config.mse + kwargs["perchannel"] = self.algo_config.perchannel + kwargs["n_samples"] = -1 + dataloader = inc_dataloader() + + self.model = gptq_quantize( + model=self.model_path if self.model_path is not None else self.model.model, + weight_config=weight_only_node_config, + dataloader=dataloader, + **kwargs, + ) + logger.info(f"complete quantization of model with {algorithm} algorithm.") + def process(self): - # use a stack to keep track of sub-graphs - graph_stack = [self.model.graph()] - opset_import = self.model.opset_import() - - has_ms_domain = False - for opset in opset_import: - if opset.domain == "com.microsoft": - has_ms_domain = True - if not has_ms_domain: - opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) - - self._process_subgraph(graph_stack) - self.model.clean_initializers() + if self.algo_config is None: + # use a stack to keep track of sub-graphs + graph_stack = [self.model.graph()] + opset_import = self.model.opset_import() + + has_ms_domain = False + for opset in opset_import: + if opset.domain == "com.microsoft": + has_ms_domain = True + if not has_ms_domain: + opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + + self._process_subgraph(graph_stack) + self.model.clean_initializers() + else: + # use IntelĀ® Neural Compressor for RTN or GPTQ weight-only quantize algorithm + try: + importlib.import_module("neural_compressor") + except Exception as e: + logging.error(f"{e}.") + raise RuntimeError( + "neural-compressor is not correctly installed. Please check your environment." + ) from e + + import neural_compressor + + assert version.parse(neural_compressor.__version__) >= version.parse( + "2.3.2" + ), "Require neural-compressor >= 2.3.2 to support weight only quantization!" 
+ + self.int4_quant_algo() def parse_args(): diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index aed46563c2764..1bd2ef42151d0 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -466,7 +466,6 @@ def quantize_static( import copy - import onnx from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant def inc_dataloader(): @@ -478,13 +477,11 @@ def inc_dataloader(): dataloader = inc_dataloader() sq = ORTSmoothQuant(model_input, dataloader, reduce_range) del dataloader - model = sq.transform( - extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True) - ).model - nodes_to_exclude.extend([i.name for i in model.graph.node if i.name not in orig_nodes]) + model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)) sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.") - model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix() - onnx.save_model(model, model_input, save_as_external_data=True) + model_input = Path(sq_path).joinpath("sq_model.onnx").as_posix() + model.save(model_input) + nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes]) model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 02f51cc4fa809..73dae08af8ece 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -71,13 +71,16 @@ def construct_model_matmul(self, output_model_path: str, symmetric: bool) -> Non output_name = "output" initializers = [] - def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str): + def make_matmul( + input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str, node_name: str + ): weight_data = self.fill_int4_data(weight_shape, symmetric).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) return onnx.helper.make_node( "MatMul", [input_name, weight_name], [output_name], + node_name, ) in_features = 52 @@ -88,6 +91,7 @@ def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_na [in_features, out_features], "linear1.weight", output_name, + "MatMul_0", ) # make graph @@ -139,6 +143,48 @@ def quant_test( else: raise exception + def quant_test_with_algo( + self, + algorithm: str, + model_fp32_path: str, + data_reader: TestDataFeeds, + block_size: int, + is_symmetric: bool, + ): + model_int4_path = str( + Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute() + ) + + # Quantize fp32 model to int4 model + from onnxruntime.quantization import matmul_4bits_quantizer + + algo_config = None + if algorithm == "RTN": + # test RTN algorithm + algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig() + elif algorithm == "GPTQ": + # test GPTQ algorithm + algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=data_reader) + + model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) + quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, block_size, 
is_symmetric, algo_config=algo_config) + quant.process() + quant.model.save_model_to_file(model_int4_path, False) + + quant_nodes = {"MatMulNBits": 1} + check_op_type_count(self, model_int4_path, **quant_nodes) + + data_reader.rewind() + + try: + check_model_correctness(self, model_fp32_path, model_int4_path, data_reader.get_next()) + except Exception as exception: + if "4b quantization not yet supported on this hardware platform!" in exception.args[0]: + # Currently we don't have int4 quantization support on all platforms, has to tolerate this exception + pass + else: + raise exception + @unittest.skipIf( find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" ) @@ -159,6 +205,28 @@ def test_quantize_matmul_int4_offsets(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test(model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_rtn_algo(self): + if not find_spec("neural_compressor"): + self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("RTN", model_fp32_path, data_reader, 32, False) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_gptq_algo(self): + if not find_spec("neural_compressor"): + self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("GPTQ", model_fp32_path, data_reader, 32, False) + if __name__ == "__main__": unittest.main() From e58319ebfc344419b94ab5f8f27f7ce5eabe56f5 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Wed, 10 Jan 2024 15:29:34 -0800 Subject: [PATCH 005/100] [TensorRT EP] Fix memleak (#19053) ### Description To fix memleak: ```bash 192 bytes in 1 blocks are definitely lost in loss record 1,254 of 1,999 at 0x483BE63: operator new(unsigned long) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so) by 0x4A93FD5: OrtApis::CreateTensorRTProviderOptions(OrtTensorRTProviderOptionsV2**) (in /code/onnxruntime/build/Linux/Release/libonnxruntime.so.1.17.0) by 0x1502E1: onnxruntime::perftest::OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env&, std::random_device&, onnxruntime::perftest::PerformanceTestConfig const&, TestModelInfo const&) (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) by 0x15A404: onnxruntime::perftest::PerformanceRunner::PerformanceRunner(Ort::Env&, onnxruntime::perftest::PerformanceTestConfig const&, std::random_device&) (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) by 0x14C6D9: real_main(int, char**) (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) by 0x145A2A: main (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) ``` add ptr to help release trtep provider options ### Motivation and Context --- onnxruntime/test/perftest/ort_test_session.cc 
| 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index ac25c98b15758..13082fe69cf48 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -170,6 +170,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device const auto& api = Ort::GetApi(); OrtTensorRTProviderOptionsV2* tensorrt_options; Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options)); + std::unique_ptr rel_trt_options( + tensorrt_options, api.ReleaseTensorRTProviderOptions); std::vector option_keys, option_values; // used to keep all option keys and value strings alive std::list buffer; From fd6bab4250c41a7f6498e6fa02ba446bc74e0a8d Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Thu, 11 Jan 2024 08:12:43 +0800 Subject: [PATCH 006/100] [js/webgpu] Provide a vectorized algorithm for GroupedConv (#18884) ### Description This PR provides a vectorized algorithm for NHWC GroupedConv to improve performance. The aggregate time of GroupedConv in mobilenetv2-12 becomes ~1ms from ~4ms on Intel Alder Lake machine. About 20% improvement for the whole model. --- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 99 +++++++++++- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 26 ++- js/web/test/data/ops/conv.jsonc | 152 +++++++++++++++++- 3 files changed, 271 insertions(+), 6 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index 14482272bad38..21b4953d3f90c 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -3,9 +3,9 @@ import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {ProgramInfo} from '../types'; +import {ProgramInfo, ProgramUniform} from '../types'; -import {inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common'; import {calculateOutputShape, ConvAttributes} from './conv'; import {getActivationSnippet} from './fuse-utils'; @@ -95,3 +95,98 @@ export const createGroupedConvProgramInfo = getShaderSource, }; }; + +export const createGroupedConvVectorizeProgramInfo = + (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => { + const hasBias = inputs.length > 2; + const components = getMaxComponents(outputShape[3]); + const outputNumber = getMaxComponents(outputShape[2]); + const outputSize = ShapeUtil.size(outputShape) / components / outputNumber; + const xShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[0].dims[2], inputs[0].dims[3] / components]; + const wShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[1].dims[3] / components]; + const outputShapeInShader = [outputShape[0], outputShape[1], outputShape[2], outputShape[3] / components]; + + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: outputSize}, {type: 'int32', data: attributes.strides}, + {type: 'int32', data: attributes.pads}, ...createTensorShapeVariables(xShape), + ...createTensorShapeVariables(wShape), ...createTensorShapeVariables(outputShapeInShader) + ]; + const xNumber = (outputNumber - 1) * attributes.strides[1] + wShape[1]; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components); + 
const {activationFunction, applyActivation} = getActivationSnippet(attributes, output.type.value); + const x = inputVariable('x', inputs[0].dataType, xShape.length, components); + const w = inputVariable('w', inputs[1].dataType, wShape.length, components); + const inputVars = [x, w]; + if (hasBias) { + inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims, components)); + } + const processBias = hasBias ? 'value += b[output_channel];' : ''; + + return ` + ${ + shaderHelper.registerUniform('output_size', 'u32') + .registerUniform('strides', 'i32', 2) + .registerUniform('pads', 'i32', 2) + .declareVariables(...inputVars, output)} + ${activationFunction} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let width0 = uniforms.output_shape[3]; + let output_channel = global_idx % width0; + var index1 = global_idx / width0; + let width1 = uniforms.output_shape[2] / ${outputNumber}u; + let col = (index1 % width1) * ${outputNumber}u; + index1 = index1 / width1; + let row = index1 % uniforms.output_shape[1]; + let batch = index1 / uniforms.output_shape[1]; + + let x_corner = vec2(i32(row), i32(col)) * uniforms.strides - uniforms.pads; + + var x_vals: array<${x.type.value}, ${xNumber}>; + var values: array<${output.type.value}, ${outputNumber}>; + let input_channel = output_channel; + // Use constant instead of uniform can give better performance for w's height/width. + for (var w_height: u32 = 0u; w_height < ${wShape[0]}; w_height++) { + let x_height = x_corner.x + i32(w_height); + if (x_height >= 0 || u32(x_height) < uniforms.x_shape[1]) { + for (var i = 0; i < ${xNumber}; i++) { + let x_width = x_corner.y + i; + if (x_width >= 0 && u32(x_width) < uniforms.x_shape[2]) { + x_vals[i] = ${x.get('batch', 'u32(x_height)', 'u32(x_width)', 'input_channel')}; + } else { + x_vals[i] = ${x.type.value}(0); + } + } + for (var w_width: u32 = 0u; w_width < ${wShape[1]}; w_width++) { + let w_val = ${w.get('w_height', 'w_width', '0', 'output_channel')}; + for (var i = 0u; i < ${outputNumber}u; i++) { + values[i] = fma(x_vals[i * ${attributes.strides[1]}u + w_width], w_val, values[i]); + } + } + } + } + + for (var i = 0u; i < ${outputNumber}u; i++) { + var value = values[i]; + ${processBias} + ${applyActivation} + ${output.set('batch', 'row', 'col + i', 'output_channel', 'value')}; + } + }`; + }; + + return { + name: 'GroupedConv-Vectorize', + shaderCache: { + hint: `${attributes.activationCacheKey};${components};${outputNumber};${xNumber};${wShape[0]};${wShape[1]}`, + inputDependencies: hasBias ? 
['rank', 'rank', 'type'] : ['rank', 'rank'] + }, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }), + getShaderSource, + }; + }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 33a5db7ff6b25..cb40a9f08d2d7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -8,7 +8,7 @@ import {ComputeContext} from '../types'; import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu'; import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu'; -import {createGroupedConvProgramInfo} from './conv-grouped'; +import {createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo} from './conv-grouped'; import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; import {createNaiveMatmulProgramInfo} from './matmul'; import {createTransposeProgramInfo} from './transpose'; @@ -136,12 +136,32 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // check attributes // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + if (isChannelsLast && inputs[1].dims[0] === attributes.group && inputs[1].dims[1] === 1 && + attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + const outputShape = calculateOutputShape( + inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, + isChannelsLast); + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + const convInputs = [inputs[0], transposedWeight]; + if (inputs.length === 3) { + convInputs.push(inputs[2]); + } + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs}); + } else { + context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + } return; } - const isChannelsLast = attributes.format === 'NHWC'; const hasBias = inputs.length === 3; const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; const inputWidth = inputs[0].dims[isChannelsLast ? 
2 : 3]; diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 2e8eaaba191d0..cc10df5864233 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -298,7 +298,157 @@ } ] }, - + { + "name": "conv - vectorize group - A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [1, 1], "type": "ints" }, + { "name": "group", "data": 2, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], + "dims": [1, 2, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0], + "dims": [2, 1, 1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 34.0], + "dims": [1, 2, 3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - C", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 44, 54, 74, 84, 94, 386, 412, 438, 490, 516, 542, 1122, 1164, 1206, 1290, 1332, 1374], + "dims": [1, 3, 2, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - D", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + 
"data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv", From 5678317bafd219e2b71c72431905b776460e11a4 Mon Sep 17 00:00:00 2001 From: Yvonne Chen Date: Thu, 11 Jan 2024 10:36:33 +0800 Subject: [PATCH 007/100] Fix the duplicated QDQ attributes setup issue (#18039) ### Description The copied QDQ node should have exactly the same attributes as the original QDQ node. Otherwise, it might cause errors when the original node has attributes that use non default values (such as axis != 1 case). An example user case is like: A DequantizeLinear node has more than 1 consumer in the graph, and its attributes axis is 0. ### Motivation and Context I see the errors like https://github.com/microsoft/onnxruntime/issues/16188 and this fix could solve the issue. --- .../ensure_unique_dq_for_node_unit.cc | 2 +- .../ensure_unique_dq_for_node_unit_test.cc | 40 ++++++++++++++++++ .../qdq_with_multi_consumer_q_dq_axis.onnx | Bin 0 -> 9361 bytes 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 onnxruntime/test/testdata/qdq_with_multi_consumer_q_dq_axis.onnx diff --git a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc index cc0f7854791d4..9d53e28921784 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/ensure_unique_dq_for_node_unit.cc @@ -53,7 +53,7 @@ Status DuplicateDQForOutputEdge(const graph_utils::GraphEdge& original_dq_output MakeString("Added by ", kTransformerName), dq_inputs, {&new_dq_output_nodearg}, - nullptr, // attributes + &original_dq_node.GetAttributes(), original_dq_node.Domain()); // set up edges diff --git a/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc b/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc index 7a67747f7cf4c..89ffb8ec87dcb 100644 --- a/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc +++ b/onnxruntime/test/optimizer/ensure_unique_dq_for_node_unit_test.cc @@ -234,4 +234,44 @@ TEST(EnsureUniqueDQForNodeUnitTests, QDQWithMultiConsumerDQNodes) { EXPECT_EQ(OpCount(op_count_before, "DequantizeLinear") + 4, OpCount(op_count_after, "DequantizeLinear")); } +TEST(EnsureUniqueDQForNodeUnitTests, QDQWithMultiConsumerDQNodesPreservingAttributes) { + constexpr auto model_uri = ORT_TSTR("testdata/qdq_with_multi_consumer_q_dq_axis.onnx"); + + SessionOptions session_options{}; + // test interaction with level 1 transformers + session_options.graph_optimization_level = TransformerLevel::Level1; + + InferenceSessionWrapper session{session_options, GetEnvironment()}; + + ASSERT_STATUS_OK(session.Load(model_uri)); + + const auto op_count_before = CountOpsInGraph(session.GetGraph()); + + ASSERT_STATUS_OK(session.Initialize()); + + const auto op_count_after = CountOpsInGraph(session.GetGraph()); + + EXPECT_EQ(OpCount(op_count_before, "DequantizeLinear") + 8, OpCount(op_count_after, "DequantizeLinear")); + + int64_t given_axis = 0; // all the following 4 DQ nodes and their duplicated one should have axis = 0 + std::string axis_dq_name0 = "Convolution28_Output_0/fusedmuladd_B/DequantizeLinear"; + std::string axis_dq_name1 = "Parameter5/DequantizeLinear"; + std::string axis_dq_name2 = 
"Convolution110_Output_0/fusedmuladd_B/DequantizeLinear"; + std::string axis_dq_name3 = "Parameter87/DequantizeLinear"; + for (const auto& node : session.GetGraph().Nodes()) { + if (node.OpType() == "DequantizeLinear") { + if (node.Name().find(axis_dq_name0) == 0 || + node.Name().find(axis_dq_name1) == 0 || + node.Name().find(axis_dq_name2) == 0 || + node.Name().find(axis_dq_name3) == 0) { + const auto& attrs = node.GetAttributes(); + ASSERT_TRUE(attrs.find("axis") != attrs.end()); + const auto& axis_attr = attrs.at("axis"); + int64_t axis = axis_attr.i(); + EXPECT_EQ(axis, given_axis); + } + } + } +} + } // namespace onnxruntime::test diff --git a/onnxruntime/test/testdata/qdq_with_multi_consumer_q_dq_axis.onnx b/onnxruntime/test/testdata/qdq_with_multi_consumer_q_dq_axis.onnx new file mode 100644 index 0000000000000000000000000000000000000000..4f575ebb2841a02802754e2a449a05ad58220afc GIT binary patch literal 9361 zcmcgy2~<;O+Rh5$W)CDJA?&LG*&)W0gi3^9sdLUR=N`E4eYfv@p6|Q& z`(9D*B+sm^+p_|;?Ap3DVB68W1MeUBDEAm0vTA%2a0?2Mw`U*Aem^%a_gD%m=7SU2 zhx5S4`v9V|`1m{7hjIamvj}u?Wbe@<2M*_v7|!x_xj*}O<-p|^w2J`Ex))Mugr zMD~XVj#DA|P*^-6b0(pkzdxG0DgX?X!d?Xd3ZQ`3lZZ)9DHEKevl-8iEh)w+VIL>O zh4JandiXr)7Q%p-wMP#B?#Q7Nc?XUhj*E-kyX{2Y(Gz)lV`JVuaXfe5`zH=%@7uRG z-DxC|%QKiAWbE>koyMQMJc~u8c}}}usv7ZWkQ3r(Q2q%9L_4J!pYqF9V=ha7wrI|f zEbZH=W=YF(zNlo+XZBnb%Ld#w9|n1q0AOZ+$IK3uICtkX7}T67lm6tdAlzdVz_~v= zZ(UBk_kbt*zZ}G4F37q}`uv}ZN9HonC={5t0iHD_f#<6*3BM!vl~)tupRkIKeG1?p z#LO*|2B5M}Yt5GRdtcqYef^GnD4a5VD6rV_c~T-RObMie3)wPx@lPn=!k9$be&ooZ1Bc&B zPM*ip$2Qo1ZC&&|9q7u6>w^Z7F*yKo1>>RP-^WNkBF zB8#(JmkB)c0|I!?cM^#(!^^n1#3z8!aZU>W_?G^EUID&>0*g`tC<~LEW021xxf6dL zPY~yk#mvpPr)A6H&dIs?_O$8q<>k33X^=Cngv(?MxJ0?KCV9@$C@y*aON$Q73xD-; z08fMe@^Ub%1TF_Zf@#eMdqOXO=0Z3Jr=P1wPKf?IF`P#Pg6dklEL<GMo}T*n%0RIG$YnFBp!MR< zuo(+~(8BeLelBnzX!?v-{)~@c(8tp3nkz&o=;s%BHskAc6an>Fpcus33p**z_9vIlk?VgKcC3GA=p|C{}n;JxfU0jt;|>Tb4Q9)``kafp3b z7Rk0%Y-2N{LD%r~B{q>6geff9HjH$KTBZQ4n?j3Xo~NM*C=?2bLV{-VJ2~fFY6h1! 
zDxd81@oFqE;@vGnE#3Gj&sy>Hx`dHVZoyBUWcvd(bTlf>*gmd~ixnbT8FT={ZzuV- zi^^Ju`m2je`f)dVPqpjet!RDFV1mc5}o<>aJ;r*+_Uh)kBgHHtuI@ZlBgt z#+wP!;-7}FhJ+Z$3y}wQ?DXj#RX5&JVdF?d8N%M4#4Q@3q2a9xgEc}x@XJzs@LG$a zW821XRBgm>12)`ifJm4=5F}O_OX|beQv8RsoWtu_n_k7a;S&uBFvA!m$|rGU`lDV* z@b>nKv*XaW;}N60cY?w*HpMXPQSQ?)?KI6_*?dK|I>Dm)VI?bgN1_)+8aw4a2)MgN z_-G}yal7L@S~P(>R~&}IiMvwZb_%Ri(>_>J+3D!1sh95ZAyXsJUE(f$2*qs=EOg&T znWvjgDBqa2R&T<7tYoCoT11lyM;pBfBa4N|qLG>c1hAU(y60DSqi?WPW=%hEhJIadW z;z#0P#zRZt!KbtSnWsbA=_3Fin>?oG$RR#gH4QnkItIjM{R?b zR-}5O$|6|y4c+{{<6hb;1BB6b_5IfK;&Bl+>J|Sg^|-O;-lu23jiKK;%@v5?VWHhO zZt+yD8P${FFlN&ll>Wjar`nzyq=NeCp8g7&} zNzCmYLW`$b)G8VH?$dusu~8xE);J`)eW0#hQ;H3#|Mcv*rT&(nN@{LwDlHtf=}9$a z*u;3{w^OK-HC1(e(N@L7f=J@P!|%!(OWNyy653HEZf^(olxFRVc#ESCQfUD>?E<&f zQr}Q{!Q3SnIUn+7fU-XVQCIc@=C8R zVfuuqh?Jaz^6tW#)dBJewM`8d=xuie-vZSTQG4Ir$;2>U-s$KKI9LUQl|U7F;B9so z(LJ>7hfy88{a@89==~1{4g>}wJwtVLB4kp;Tnh)HXg!{Zto|MyQ#GYE)pSN65(`ZuR^iDM5-{){DVa>hK zyL{uQLJwC2cZ9E0U;&J>E2PCu2VW}`8wVn#!^S|4s;ygLZn-N*Cs3MXRvLt_NPGRX zdqQkiqY|&J;j8uB@<3D9=q7x>ikga1kGW0bjYB@}D|KW8k%mFK#f-t+NRh21#`H@T zMVB8j(px%-3-oWl-t}_m;3LaEL_bs#DJC9t8yElH582gO(b|N+)7K5{xq{j9k*GVC z?0Ey`4@{wh?UQ0spuFd*`%!FlL=sz)<%_fLA;yVFZNj&c-OAdx$6ZuLa9$|$)#qL7 zos30{>QTOiqB9h@^r4bGaW08Iar?m8P{dh)_I4=(-K-6;H+Yu3fnqSn1q-=d{@mW^ER`d(3frc!hMPk$3O zYRjy_s6r1TUtJL!0^#?ve1}aBs11p@Q+`-f`{3&IlyXLr?aufH=;bD3#OIALR++jI zj&C*#|uJFrbvXvZ?DRvJEQS65b$4&J5uJO_tR0)3d6P|?wo}>2~y;|TTf>APL zlb!Mo2(PY*^lU3^R7wkqgcVkVskLduVdBaA3~F<|6&bOM7QcFh(m*ol){Mu6wRrS4 zC`Yju2_ks=uxivOF3O8aZ|DkRCjUg}G9+%;ZliKL)QQS6XuNuWLbq+C!8 zk$%Ay8~tM3YHe!2ydGL}Xcmlb4df6y+NV?OG*hp--#^h{s_WNRivA()m4)*k2wJF+ zG#{Nt6&zXVC->lspghE-n^3Xlkw$gHy|z!P20U8@ba7vSK3XmonD$Es&9D%%HNU*m z7cp{mR4y4TZLznvw)*O8dvWn)MPJpem6|o`OfAz+c%8OAjQL(Xf!Yu89-$ynqimB9 z;*YXgtBJ*((Y3noN*kyB!)(Zb*q2(>y2)NoFGsoO3kj3n`=ir1ME!`^64ZerSYsZl zwf>vojL`8Udi**M4ZpD(qHG&86Rgk)VqkCRI`4<6dp@e?0+#;1WX$wbu$w=0g;(UL zr&(vbE54;V3hHIp$AQUi2%QAWN27B27?f{>;U9%?Ot(f z>=HexsJBGkD3~-=Q&(=#A5* z=0VhCQ^ORTBD}1JMHyuVbH%mdPLZYX%luo7Llsp*{B%&?0138#%0fNF8MBzx4SFoB z)6#;Pm||-?WxW`Kt!z{l>JVxi6;NEg#pFgFARF6lQv%JTwvJ2hX2y=8r;HQRQcY`} z3{uN$Xe45!VnJ*y@?4qGMl3t8ZSNM|gidJdZ&(Iy@+{OB`S-=SzcDa03VBY{n{COuo;QK zkMKzN@+%#X5_EGzH*bwQPO6lvsvQI0D0m*jl=|YL|Jg75K~^S)k!YK5|NCy+7_x0J zj*y6$4yU|oGwfO07K}?<$LrEuyKuGE92V2xAnwx-6zBi#8V{o7^9gF60a8?Oxz*3o z_7~okuf-a zs(5(+YQ&Wr`r@9+o{4h#8Zx1)Uc1*zYQb?3H}O8q3|2^W`I*}%FY#m&Wn=RNQ8nKF zRbhW2gw*YIK4}ZIv2XQD5MxT3i8m0zz)u=F-VL+V8906#P2|BTp(ZE|c6Taq;!&Gn z3`1(cQV02#=wiWrS!5$$E^oR#VnX-ldo!rgvFM1>A^ZNo5&>*C>gHH3cCZwF>d!Vu z$4_D(pvFw1YP3BRavw}XQeyb!7tRl1Mjt{t(O3$ckSIhdqx269N_9_vd5J~}Ttbkb z;dSJs(B|Y9|*c9g84KRHCE)`+pH*oz7l3}%*d+l&7Re~8{rl3&Zj%iWvDI-8s zEM(ug`M+#@eFvKsx`+M2rhN9k!Q1TUl8@Pk4t>IAhJu#PI~(U7RR&6iAma^g1`Bfp z3hsiSbNe8dov(~~XkYO^J57K7SJ!TjuZB3R({-#DZ*vFXkG{D*#rp6)yrvq?$&o+k z+x-Ujs^oEvxmbv&b?H1PhjrvbD-~CD=;^41@^SGlc+?- z&PYxsX8i{rQn#L{>o(N1B7NIxxT=lXFS$L0sA-)s^f)J2-Jl=@g#XArReoT77x)dw z+3cq$z8Lpr42Dz6eYExc&;EM%jn&*sy)O{zPd8@-A;~I(7#FkIe@FOQjwGB4r#eJE z4cu|wmxVm$@y8p`U(*1~NNb4qNvT+aNO?<1OSxBsO<6N(Wvxu4rc{_hQr_j7Sy)DV zN)?}$^4-gjl(-466mEV6>tRT6${M|fbvY5A!u${n>X!bi8)+G7kpJ)>^S>TPcJ4+k zr$LyJmT7i4{-k#}rnL@7rrO~+170oQbq|zL1|5zpmBYaqa5y%Y9FE^XA&{3G4#(@A z;5{h+ddlHo?s!bwU(qtl2!Mam%G$Z=Ggj=$P8P?yhxMW|pY`rHcUYvPJkbs5Qb=u*uV<8Zu)#0cFi(>){KtnmI5TBanK|A|9Ombc7N2te9PsHg<{Yxi)7=@d zi(^KofKQ)i=1{@^wlLdPcPFgrQK^+X5r^w@U!41t^sx#?6CjP--C+awj>8aUqaK$1@El`~) z!K}YX7gSpid|o-q6$O`c4QekYp$lysm<;YMp47LVbo>h=cP3eQKs;x=3f%Aa<##wN!|71OOqB{tq3W Bly?9C literal 0 HcmV?d00001 From 053ddfe3fd52135742567940750b7bf6ccffe166 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 10 
Jan 2024 18:45:49 -0800 Subject: [PATCH 008/100] Disable per-session thread pool for web (#18480) ### Description ORT web prefers to use a global thread pool for all inference sessions. See how OrtCreateSession is implemented in https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/wasm/api.cc#L183 . Application code can only the global thread poo. However, internal testing code still often use per-session threadpool. This pr is to fix the inconsistency. ### Motivation and Context Replace PR #18476 --- onnxruntime/core/framework/session_options.h | 8 +++++++- .../test/framework/inference_session_test.cc | 18 ++++++++++++++++++ .../cpu/activation/activation_op_test.cc | 3 +++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 40c59cfcf699d..796a018ac0f68 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -65,6 +65,11 @@ struct FreeDimensionOverride { * Configuration information for a session. */ struct SessionOptions { +#if defined(__wasm__) && defined(__EMSCRIPTEN_PTHREADS__) + static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = false; +#else + static constexpr bool DEFAULT_USE_PER_SESSION_THREADS = true; +#endif ExecutionMode execution_mode = ExecutionMode::ORT_SEQUENTIAL; // set the execution order of the graph @@ -129,7 +134,8 @@ struct SessionOptions { // By default the session uses its own set of threadpools, unless this is set to false. // Use this in conjunction with the CreateEnvWithGlobalThreadPools API. - bool use_per_session_threads = true; + bool use_per_session_threads = DEFAULT_USE_PER_SESSION_THREADS; + bool thread_pool_allow_spinning = true; // Deterministic compute is likely not as performant. This option is default to false. diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 486ec37d1eebd..2522ee3b496f6 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -578,6 +578,9 @@ TEST(InferenceSessionTests, ModelMetadata) { } #endif TEST(InferenceSessionTests, CheckRunLogger) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } SessionOptions so; so.session_logid = "CheckRunLogger"; @@ -837,6 +840,9 @@ TEST(InferenceSessionTests, PreAllocateOutputVector) { } TEST(InferenceSessionTests, ConfigureVerbosityLevel) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } SessionOptions so; so.session_logid = "ConfigureVerbosityLevel"; @@ -2661,6 +2667,9 @@ class InferenceSessionTestSharingAllocator : public InferenceSessionWrapper { // Ensure sessions use the same allocator. It uses ORT created allocator. TEST(InferenceSessionTests, AllocatorSharing_EnsureSessionsUseSameOrtCreatedAllocator) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } auto logging_manager = std::make_unique( std::unique_ptr(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); @@ -2706,6 +2715,9 @@ TEST(InferenceSessionTests, AllocatorSharing_EnsureSessionsUseSameOrtCreatedAllo // Ensure sessions don't use the same allocator. It uses ORT created allocator. 
TEST(InferenceSessionTests, AllocatorSharing_EnsureSessionsDontUseSameOrtCreatedAllocator) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } auto logging_manager = std::make_unique( std::unique_ptr(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); @@ -2758,6 +2770,9 @@ class InferenceSessionTestSharingInitializer : public InferenceSessionWrapper { }; TEST(InferenceSessionTests, InitializerSharing_EnsureSessionsUseUserAddedInitializer) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } auto logging_manager = std::make_unique( std::unique_ptr(new CLogSink()), logging::Severity::kVERBOSE, false, LoggingManager::InstanceType::Temporal); @@ -2942,6 +2957,9 @@ TEST(InferenceSessionTests, GlobalThreadPoolWithDenormalAsZero) { // test inter thread pool with setting denormal as zero TEST(InferenceSessionTests, InterThreadPoolWithDenormalAsZero) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } // test if denormal-as-zero mode is supported if (!SetDenormalAsZero(false)) { return; diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index 7ec9e0f345187..ddb0a6620619c 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -588,6 +588,9 @@ TEST_F(ActivationOpTest, Softplus) { } TEST_F(ActivationOpNoInfTest, Softsign) { + if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { + GTEST_SKIP() << "Skipping the test"; + } // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 1, which exceeds threshold"; From 0a0ef958eb94d94a17fd8c09a6d217e0589827a2 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 10 Jan 2024 19:26:01 -0800 Subject: [PATCH 009/100] update .vscode/settings.json (#19084) ### Description `"explicit"` now replaced `true` to config entry "source.organizeImports". Latest VSCode will automatically modify this config. --- .vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 2f2adc78f6de9..3e2b1f31dd6cf 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -11,7 +11,7 @@ // Auto sort imports "editor.formatOnSave": true, "editor.codeActionsOnSave": { - "source.organizeImports": true + "source.organizeImports": "explicit" }, "editor.defaultFormatter": "ms-python.black-formatter" }, From d03e477b9026a97d22dba64cd00b4614603671e5 Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 11 Jan 2024 12:50:55 +0800 Subject: [PATCH 010/100] Fix missing subgraph candidates for recompute (#19077) ### Fix missing subgraph candidates for recompute For subgraphs for example `MatMul+Transpose+Reshape`, since the ending node is a Reshape, in ORT, it is reusing input buffers. Currently, the subgraph detection logic has defect, as a result, those subgraphs will be missing as recompute candidates. Also append a few more node types for recompute support. TODO: add unit test later. This PR is needed for a customer model now. 
--- .../memory_optimizer/memory_insight.cc | 34 +++++++++----- .../memory_optimizer/optimization_planner.cc | 29 ------------ .../memory_optimizer/optimization_planner.h | 2 +- .../memory_optimizer/recompute_analysis.cc | 3 ++ .../memory_optimizer/recompute_analysis.h | 45 +++++++++++++++++++ .../ortmodule/_graph_execution_manager.py | 2 +- .../python/training/ortmodule/options.py | 9 ++++ 7 files changed, 83 insertions(+), 41 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc index 9b77832abb6f1..3fbdd5da7b768 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/memory_insight.cc @@ -485,12 +485,15 @@ void ListAllCombinations(const InlinedVector> new_combination = current_combination; - new_combination.push_back(plan); - ListAllCombinations(all_possible_node_optimization_plans, index + 1, new_combination, logger, all_combinations); - } + const InlinedVector>>& + plan_combination_list_at_cur_index = all_possible_node_optimization_plans[index]; + // For the index-th reused buffer, iterate all possible complete plans. + for (size_t i = 0; i < plan_combination_list_at_cur_index.size(); ++i) { + const auto& plan_combination = plan_combination_list_at_cur_index[i]; + InlinedVector> new_combination = current_combination; + // Append the chosen complete plan and continue exploring the next reused buffer by index + 1. + new_combination.insert(new_combination.end(), plan_combination.begin(), plan_combination.end()); + ListAllCombinations(all_possible_node_optimization_plans, index + 1, new_combination, logger, all_combinations); } MO_LOG_DEBUG_INFO(logger, "Exit ListAllCombinations"); @@ -520,17 +523,28 @@ void IterateNodeOptimizationPlan(const std::shared_ptr } InlinedVector>>> - all_possible_node_optimization_plans; - all_possible_node_optimization_plans.resize(plan->reuse_buffers.size()); + all_possible_node_optimization_plans(plan->reuse_buffers.size()); size_t i = 0; for (const auto& p : plan->reuse_buffers) { MO_LOG_DEBUG_INFO(logger, ">>>reuse buffer: " + std::to_string(p.first)); - IterateNode(p.second.first, node_to_optimization_plans_map, {}, logger, all_possible_node_optimization_plans[i]); + // If the resued node is part of current node optimization plan, then we just add current combination to the result. 
+ if (plan->GetOptimizationType() == OptimizationType::RecomputeWithCompromise || plan->GetOptimizationType() == OptimizationType::Recompute) { + const auto& recompute_subgraph = + dynamic_cast(plan.get())->GetNodesInTopoOrder(); + if (std::find(recompute_subgraph.begin(), recompute_subgraph.end(), p.second.first) != recompute_subgraph.end()) { + all_possible_node_optimization_plans[i].push_back(current_combination); + } + } + + if (all_possible_node_optimization_plans[i].size() == 0) { + IterateNode(p.second.first, node_to_optimization_plans_map, current_combination, logger, all_possible_node_optimization_plans[i]); + } + ++i; } - ListAllCombinations(all_possible_node_optimization_plans, 0, current_combination, logger, all_combinations); + ListAllCombinations(all_possible_node_optimization_plans, 0, {}, logger, all_combinations); MO_LOG_DEBUG_INFO(logger, "Exit IterateNodeOptimizationPlan: " + plan->GetClusterId()); } diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc index 64e99a4a0bca5..4ce896c5350b0 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.cc @@ -15,35 +15,6 @@ namespace onnxruntime::optimizer::memory_optimizer { -std::string NodeOptimizationPlanBase::GetMemorySavingSymbolicString() const { - std::string saving_str; - for (auto output_index : activation_output_indices_) { - // If the output is reusing other node's buffer, then no memory saving. - if (reuse_buffers.find(output_index) != reuse_buffers.end()) { - continue; - } - - const auto& output_def = node->OutputDefs()[output_index]; - MLDataType ml_data_type = DataTypeImpl::TypeFromProto(*output_def->TypeAsProto()); - ORT_ENFORCE(ml_data_type->IsTensorType(), "ml_type must be a tensor type, but it is ", - DataTypeImpl::ToString(ml_data_type)); - const TensorTypeBase* tensor_type_base = ml_data_type->AsTensorType(); - ORT_ENFORCE(nullptr != tensor_type_base); - MLDataType elt_type = tensor_type_base->GetElementType(); - const auto byte_count_per_element = elt_type->Size(); - if (!saving_str.empty()) { - saving_str += " + "; - } - saving_str = "(" + GetActivationOutputDimParamString(output_index) + " * " + - std::to_string(byte_count_per_element) + " * " + - std::to_string(GetSaveRatio()) + ")"; - } - if (saving_str.empty()) { - return saving_str; - } - return "(" + saving_str + ")"; -} - Status MemoryOptimizationPlanner::UpdateNodePlansFromExecutionPlan(const GraphViewer& graph_viewer, const OrtValueNameIdxMap& ortvalue_name_to_idx_map, const SequentialExecutionPlan& p_seq_exec_plan) { diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h index c585b2810b39d..789f530b29f1d 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/optimization_planner.h @@ -83,7 +83,7 @@ class NodeOptimizationPlanBase { /** * Get a symbolic string to represent the memory saving for this optimization plan. 
*/ - std::string GetMemorySavingSymbolicString() const; + virtual std::string GetMemorySavingSymbolicString() const = 0; std::string GetActivationOutputDimParamString(size_t index) const { ORT_ENFORCE(activation_output_dim_params_.find(index) != activation_output_dim_params_.end(), diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 52dea571a1eaf..12c83591c0036 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -72,12 +72,14 @@ const InlinedHashMap& GetAllowedRecompu {"Add", AllowedRecomputeNodeConfig{{0, 1}}}, {"BiasGelu", AllowedRecomputeNodeConfig{{0, 1}}}, {"Div", AllowedRecomputeNodeConfig{{0, 1}}}, + {"Equal", AllowedRecomputeNodeConfig{{0, 1}}}, {"Mul", AllowedRecomputeNodeConfig{{0, 1}}}, {"Sub", AllowedRecomputeNodeConfig{{0, 1}}}, // Data layout /// The shape input is trivial whether it exists or not in backward. {"Reshape", AllowedRecomputeNodeConfig{{0}}}, + {"Shape", AllowedRecomputeNodeConfig{{0}}}, {"Squeeze", AllowedRecomputeNodeConfig{{0}}}, {"Transpose", AllowedRecomputeNodeConfig{{0}}}, {"Unsqueeze", AllowedRecomputeNodeConfig{{0}}}, @@ -92,6 +94,7 @@ const InlinedHashMap& GetAllowedRecompu {"Expand", AllowedRecomputeNodeConfig{{0}}}, {"FastGelu", AllowedRecomputeNodeConfig{{0}}}, {"Gelu", AllowedRecomputeNodeConfig{{0}}}, + {"QuickGelu", AllowedRecomputeNodeConfig{{0}}}, // Ternary elementwise {"Where", AllowedRecomputeNodeConfig{{0, 1, 2}}}, diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h index d9693835313b8..ab114d970191e 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.h @@ -86,6 +86,51 @@ class NodeRecomputePlan : public NodeOptimizationPlanBase { std::string GetNodesInTopoOrderStr() const; + std::string GetMemorySavingSymbolicString() const override { + std::string saving_str; + for (auto output_index : GetActivationOutputIndices()) { + // If the output is reusing other node's buffer, then no memory saving. + std::string cur_output_saving_str; + + bool is_reused = reuse_buffers.find(output_index) != reuse_buffers.end(); + bool is_src_node_in_cur_node_subgraph = false; + if (is_reused) { + // Here we assume the src_node is the real owner of the buffer, so we don't need trace further. + const auto* src_node = reuse_buffers.at(output_index).first; + is_src_node_in_cur_node_subgraph = std::find(nodes_in_topological_order_.begin(), + nodes_in_topological_order_.end(), + src_node) != nodes_in_topological_order_.end(); + } + + if (!is_reused || is_src_node_in_cur_node_subgraph) { + // For is_src_node_in_cur_node_subgraph is True, still use the output to calculate the saving, because + // reusing buffer is the same size. 
+ const auto& output_def = node->OutputDefs()[output_index]; + MLDataType ml_data_type = DataTypeImpl::TypeFromProto(*output_def->TypeAsProto()); + ORT_ENFORCE(ml_data_type->IsTensorType(), "ml_type must be a tensor type, but it is ", + DataTypeImpl::ToString(ml_data_type)); + const TensorTypeBase* tensor_type_base = ml_data_type->AsTensorType(); + ORT_ENFORCE(nullptr != tensor_type_base); + MLDataType elt_type = tensor_type_base->GetElementType(); + const auto byte_count_per_element = elt_type->Size(); + cur_output_saving_str = GetActivationOutputDimParamString(output_index) + " * " + + std::to_string(byte_count_per_element) + " * " + + std::to_string(GetSaveRatio()); + } else { + cur_output_saving_str = "0"; + } + + if (!saving_str.empty()) { + saving_str += " + "; + } + + saving_str = "(" + cur_output_saving_str + ")"; + } + + ORT_ENFORCE(!saving_str.empty(), "saving_str should not be empty for node: ", node->OpType(), " ", node->Name()); + return "(" + saving_str + ")"; + } + private: bool compromise_recompute_; InlinedVector nodes_in_topological_order_; diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 76943b954837b..853eab61b4bd6 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -243,7 +243,7 @@ def _get_session_config(self): # requires PRIORITY_BASED order to work properly. So we use PRIORITY_BASED order when recompute is enabled. session_options.execution_order = ( onnxruntime.ExecutionOrder.PRIORITY_BASED - if self._runtime_options.memory_optimizer_config != "" + if self._runtime_options.memory_optimizer_is_enabled() else onnxruntime.ExecutionOrder.DEFAULT ) # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index a93f6413b7ab4..bfa38efb349ae 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -399,3 +399,12 @@ def _override_from_env_vars(self): if "ORTMODULE_DEEPCOPY_BEFORE_MODEL_EXPORT" in os.environ: self.deepcopy_before_model_export = int(os.getenv("ORTMODULE_DEEPCOPY_BEFORE_MODEL_EXPORT")) == 1 + + def memory_optimizer_is_enabled(self) -> bool: + """Check whether memory optimizer is enabled.""" + if self.memory_optimization_level == _MemoryOptimizationLevel.USER_SPECIFIED: + return len(self.memory_optimizer_config) > 0 + elif self.memory_optimization_level == _MemoryOptimizationLevel.TRANSFORMER_LAYERWISE_RECOMPUTE: + return True + + return False From f11713702f8cfac5785c970d96c455e883d6c269 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Jan 2024 22:08:14 -0800 Subject: [PATCH 011/100] Bump follow-redirects from 1.15.2 to 1.15.4 in /js/node (#19070) Bumps [follow-redirects](https://github.com/follow-redirects/follow-redirects) from 1.15.2 to 1.15.4.
Commits:
- 6585820 Release version 1.15.4 of the npm package.
- 7a6567e Disallow bracketed hostnames.
- 05629af Prefer native URL instead of deprecated url.parse.
- 1cba8e8 Prefer native URL instead of legacy url.resolve.
- 72bc2a4 Simplify _processResponse error handling.
- 3d42aec Add bracket tests.
- bcbb096 Do not directly set Error properties.
- 192dbe7 Release version 1.15.3 of the npm package.
- bd8c81e Fix resource leak on destroy.
- 9c728c3 Split linting and testing.
- Additional commits viewable in the compare view.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=follow-redirects&package-manager=npm_and_yarn&previous-version=1.15.2&new-version=1.15.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/node/package-lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/js/node/package-lock.json b/js/node/package-lock.json index c1cf8af4bb80e..542eebe746d59 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -336,9 +336,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.4", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", + "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", "dev": true, "funding": [ { @@ -1242,9 +1242,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.4", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", + "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", "dev": true }, "form-data": { From a89db01fceafde24669f44b7f8f52a86b8ce63a6 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Fri, 12 Jan 2024 00:13:14 +0800 Subject: [PATCH 012/100] [js/webgpu] disable GroupedConvVectorize path (#19090) Disable createGroupedConvVectorizeProgramInfo path due to bots failures on below two cases: [webgpu]Conv - conv - vectorize group - B [webgpu]Conv - conv - vectorize group - D --- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index cb40a9f08d2d7..7af2c5db49f40 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -138,8 +138,12 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - if (isChannelsLast && inputs[1].dims[0] === attributes.group && inputs[1].dims[1] === 1 && - attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + // Temporarily disable createGroupedConvVectorizeProgramInfo path due to bots failures with below two cases: + // [webgpu]Conv - conv - vectorize group - B + // [webgpu]Conv - conv - vectorize group - D + const disableGroupedConvVectorize = true; + if (!disableGroupedConvVectorize && isChannelsLast && inputs[1].dims[0] === attributes.group && + inputs[1].dims[1] === 1 && attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { const outputShape = calculateOutputShape( inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, isChannelsLast); From 2eb3db6bf03b3f190d20542bfdbd9dd0a84881c8 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Thu, 11 Jan 2024 11:34:28 -0500 Subject: [PATCH 013/100] Adding python3.12 support to ORT (#18814) ### Description Adding python3.12 support to ORT ### Motivation and Context --- setup.py | 1 + 
.../orttraining-py-packaging-pipeline-cpu.yml | 2 ++ .../stages/py-cuda-packaging-stage.yml | 8 ++++++ .../templates/py-package-smoking-test.yml | 2 ++ .../templates/py-packaging-stage.yml | 26 +++++++++++++++++++ .../py-packaging-training-cuda-stage.yml | 6 +++++ .../set-python-manylinux-variables-step.yml | 4 +++ .../linux/build_linux_python_package.sh | 2 +- .../linux/docker/Dockerfile.manylinux2_28_cpu | 6 +++++ .../docker/Dockerfile.manylinux2_28_cuda | 6 +++++ .../docker/Dockerfile.manylinux2_28_rocm | 6 +++++ ...Dockerfile.manylinux2_28_training_cuda11_8 | 6 +++++ ...Dockerfile.manylinux2_28_training_cuda12_2 | 6 +++++ .../python/cpu/Dockerfile.manylinux2_28_cpu | 6 +++++ .../x64/python/cpu/scripts/install_deps.sh | 2 +- .../x64/python/cpu/scripts/requirements.txt | 3 ++- .../docker/scripts/install_python_deps.sh | 2 ++ .../docker/scripts/manylinux/install_deps.sh | 2 +- .../scripts/manylinux/install_deps_eager.sh | 2 +- .../docker/scripts/manylinux/requirements.txt | 3 ++- .../linux/docker/scripts/requirements.txt | 3 ++- .../stage1/requirements_rocm/requirements.txt | 3 ++- .../ortmodule/stage2/requirements.txt | 3 ++- tools/ci_build/requirements.txt | 3 ++- 24 files changed, 103 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index 685f0612e3762..e94165fdf9b05 100644 --- a/setup.py +++ b/setup.py @@ -451,6 +451,7 @@ def finalize_options(self): "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Operating System :: Microsoft :: Windows", "Operating System :: MacOS", ] diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 693a06f9844f5..07b233590bcf5 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -28,6 +28,8 @@ stages: PythonVersion: '3.10' Python311: PythonVersion: '3.11' + Python312: + PythonVersion: '3.12' steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index f3d68957d649c..e6d8ee35e75e3 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -92,6 +92,14 @@ stages: EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} + - template: ../templates/py-win-gpu.yml + parameters: + MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' + PYTHON_VERSION: '3.12' + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_NAME: gpu + CudaVersion: ${{ parameters.cuda_version }} + - ${{ if eq(parameters.enable_linux_gpu, true) }}: - template: ../templates/py-linux-gpu.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml index 8d5ca19a73535..0cb438c71066e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -30,6 +30,8 @@ jobs: PythonVersion: '3.10' Python311: PythonVersion: '3.11' + Python312: + 
PythonVersion: '3.12' steps: - checkout: none diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 7cee5045bc4f3..a3c2983b755d0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -77,6 +77,10 @@ stages: PythonVersion: '3.11' MsbuildPlatform: x64 buildArch: x64 + Python312_x64: + PythonVersion: '3.12' + MsbuildPlatform: x64 + buildArch: x64 # Training build cannot support Win32 for now because one or more of its python # dependencies does not support Win32. So, don't build a training package for Win32 ${{ if not(contains(parameters.build_py_parameters, '--enable_training')) }}: @@ -96,6 +100,10 @@ stages: PythonVersion: '3.11' MsbuildPlatform: Win32 buildArch: x86 + Python312_x86: + PythonVersion: '3.12' + MsbuildPlatform: Win32 + buildArch: x86 variables: OnnxRuntimeBuildDirectory: '$(Build.BinariesDirectory)' EnvSetupScript: setup_env.bat @@ -295,6 +303,14 @@ stages: ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu + - template: py-win-gpu.yml + parameters: + MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' + PYTHON_VERSION: '3.12' + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="C:\local\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + ENV_SETUP_SCRIPT: setup_env_gpu.bat + EP_NAME: gpu + - template: py-win-gpu.yml parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' @@ -327,6 +343,14 @@ stages: ENV_SETUP_SCRIPT: setup_env.bat EP_NAME: directml + - template: py-win-gpu.yml + parameters: + MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' + PYTHON_VERSION: '3.12' + EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos + ENV_SETUP_SCRIPT: setup_env.bat + EP_NAME: directml + - ${{ if eq(parameters.enable_mac_cpu, true) }}: - job: MacOS_py_Wheels timeoutInMinutes: 180 @@ -346,6 +370,8 @@ stages: PythonVersion: '3.10' Python311: PythonVersion: '3.11' + Python312: + PythonVersion: '3.12' steps: - checkout: self clean: true diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index 7fdd7e54e752d..e7b935712ac6c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -98,6 +98,12 @@ stages: OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} UploadWheel: ${{ parameters.upload_wheel }} + Python312: + PythonVersion: '3.12' + TorchVersion: ${{ parameters.torch_version }} + OpsetVersion: ${{ parameters.opset_version }} + CudaVersion: ${{ parameters.cuda_version }} + UploadWheel: ${{ parameters.upload_wheel }} steps: - task: CmdLine@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml index 110eaff46f460..1fe58a7239369 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-python-manylinux-variables-step.yml @@ -30,6 +30,10 @@ steps: variables = { "PythonManylinuxDir": "/opt/python/cp311-cp311" } 
+ elif version == "3.12": + variables = { + "PythonManylinuxDir": "/opt/python/cp312-cp312" + } else: raise ValueError("Unsupported Python version: '{}'".format(version)) diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 3c1c65c9a6862..4c0a39fdc512e 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -9,7 +9,7 @@ EXTRA_ARG="" # Put 3.8 at the last because Ubuntu 20.04 use python 3.8 and we will upload the intermediate build files of this # config to Azure DevOps Artifacts and download them to a Ubuntu 20.04 machine to run the tests. -PYTHON_EXES=("/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp38-cp38/bin/python3.8") +PYTHON_EXES=("/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12" "/opt/python/cp38-cp38/bin/python3.8") while getopts "d:p:x:c:" parameter_Option do case "${parameter_Option}" in diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index af87852561e0a..546fca69201a1 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -116,6 +116,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -127,6 +131,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -140,6 +145,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index 8f265b208cd47..0c95083d614ed 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -119,6 +119,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -131,6 +135,7 @@ COPY 
--from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh FROM runtime_base @@ -143,6 +148,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm index b9fd88083f218..dd7c669c37885 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm @@ -135,6 +135,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ @@ -147,6 +151,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -160,6 +165,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 index 09ab7951552a0..a6a75afb0f4c3 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 @@ -119,6 +119,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -130,6 +134,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -143,6 +148,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt 
\ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 index a36f60b87768d..d29157daef611 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 @@ -119,6 +119,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/install-pypy.sh \ build_scripts/pypy.sha256 \ @@ -130,6 +134,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -143,6 +148,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu index 06e75ee1a39f6..66fe0cafd945b 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/Dockerfile.manylinux2_28_cpu @@ -114,6 +114,10 @@ FROM build_cpython AS build_cpython311 COPY build_scripts/cpython-pubkey-310-311.txt /build_scripts/cpython-pubkeys.txt RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.11.2 +FROM build_cpython AS build_cpython312 +COPY build_scripts/cpython-pubkey-312-313.txt /build_scripts/cpython-pubkeys.txt +RUN manylinux-entrypoint /build_scripts/build-cpython.sh 3.12.1 + FROM build_cpython AS all_python COPY build_scripts/finalize-python.sh \ /build_scripts/ @@ -122,6 +126,7 @@ COPY --from=build_cpython38 /opt/_internal /opt/_internal/ COPY --from=build_cpython39 /opt/_internal /opt/_internal/ COPY --from=build_cpython310 /opt/_internal /opt/_internal/ COPY --from=build_cpython311 /opt/_internal /opt/_internal/ +COPY --from=build_cpython312 /opt/_internal /opt/_internal/ RUN manylinux-entrypoint /build_scripts/finalize-python.sh @@ -135,6 +140,7 @@ COPY build_scripts/finalize.sh \ build_scripts/requirements3.9.txt \ build_scripts/requirements3.10.txt \ build_scripts/requirements3.11.txt \ + build_scripts/requirements3.12.txt \ build_scripts/requirements-base-tools.txt \ /build_scripts/ COPY build_scripts/requirements-tools/* /build_scripts/requirements-tools/ diff --git 
a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh index 7bf031ee78485..f576b867da73b 100755 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_deps.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -x pushd . -PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") +PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12") CURRENT_DIR=$(pwd) if ! [ -x "$(command -v protoc)" ]; then $CURRENT_DIR/install_protobuf.sh diff --git a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt index aa0ad05b42dbf..7249fd2331321 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/requirements.txt @@ -1,5 +1,6 @@ numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' mypy pytest setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh index 86585b75d43fe..1ac1d226deec6 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh @@ -46,6 +46,8 @@ elif [[ "$PYTHON_VER" = "3.10" && -d "/opt/python/cp310-cp310" ]]; then PYTHON_EXE="/opt/python/cp310-cp310/bin/python3.10" elif [[ "$PYTHON_VER" = "3.11" && -d "/opt/python/cp311-cp311" ]]; then PYTHON_EXE="/opt/python/cp311-cp311/bin/python3.11" +elif [[ "$PYTHON_VER" = "3.12" && -d "/opt/python/cp312-cp312" ]]; then + PYTHON_EXE="/opt/python/cp312-cp312/bin/python3.12" else PYTHON_EXE="/usr/bin/python${PYTHON_VER}" fi diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh index 8c79918120d8d..5b181a484a607 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps.sh @@ -19,7 +19,7 @@ PARENT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." &> /dev/null && pwd)" source "$PARENT_DIR/install_dotnet.sh" if [ ! 
-d "/opt/conda/bin" ]; then - PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") + PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12") else PYTHON_EXES=("/opt/conda/bin/python") fi diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh index ad3366b0bb3b6..d8d2fbc06a00b 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_eager.sh @@ -6,7 +6,7 @@ yum -y install \ graphviz if [ ! -d "/opt/conda/bin" ]; then - PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11") + PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12") else PYTHON_EXES=("/opt/conda/bin/python") fi diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index d6912bfb05efe..94f52f476579b 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -1,5 +1,6 @@ numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' mypy pytest setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index 0fc80b30c1b3a..58a342277fc2d 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -1,6 +1,7 @@ cerberus numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' mypy pytest setuptools==69.0.3 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt index 9c52aff960d6e..57331d6df97d9 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt @@ -1,2 +1,3 @@ numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' \ No newline at end of file +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt index 2b557f2aee00f..47f64568f424a 100644 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt +++ 
b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt @@ -1,7 +1,8 @@ pandas scikit-learn numpy==1.21.6 ; python_version < '3.11' -numpy==1.24.2 ; python_version >= '3.11' +numpy==1.24.2 ; python_version == '3.11' +numpy==1.26.0 ; python_version >= '3.12' transformers==v4.36.0 accelerate==0.25.0 rsa==4.9 diff --git a/tools/ci_build/requirements.txt b/tools/ci_build/requirements.txt index aaca45b3e17e1..57fc8f08336d2 100644 --- a/tools/ci_build/requirements.txt +++ b/tools/ci_build/requirements.txt @@ -1,7 +1,8 @@ # packages used by transformers python unittest (only enabled in Linux CPU CI Pipeline) packaging protobuf==3.20.2 -numpy==1.24.0 +numpy==1.24.0 ; python_version < '3.12' +numpy==1.26.0 ; python_version >= '3.12' coloredlogs==15.0 transformers==4.36.0 psutil From f68dfcd8887ba1993143a0e496e127e7af2d4f4a Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Thu, 11 Jan 2024 09:19:12 -0800 Subject: [PATCH 014/100] [CUDA] Improve performance of DecoderMaskedMultiheadAttention on A100 (#18695) ### Description Currently there are 2 memory latency bound hotspots in the DecoderMaskedMultiheadAttention kernel in terms of reading from global memory - one reading K values and the other reading V values. The current logic to read them both is something like this - for(int i=0; i