From f917dde71740982c4520febc0ced1bff58b0068d Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sat, 13 Jan 2024 23:04:02 -0800 Subject: [PATCH 01/39] [web] remove xnnpack from web backends (#19116) ### Description XNNPACK is already disabled in web assembly build. This change removes the xnnpack backend registration in JS. --- js/common/lib/inference-session.ts | 2 +- js/web/lib/index.ts | 7 ++----- js/web/lib/wasm/session-options.ts | 3 --- js/web/script/test-runner-cli-args.ts | 7 +++---- js/web/test/test-runner.ts | 4 ++-- .../github/azure-pipelines/templates/win-web-ci.yml | 6 +++--- .../azure-pipelines/templates/win-web-multi-browsers.yml | 6 +++--- 7 files changed, 14 insertions(+), 21 deletions(-) diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index edc32535fc64d..1221b52cd4985 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -181,7 +181,7 @@ export declare namespace InferenceSession { // Currently, we have the following backends to support execution providers: // Backend Node.js binding: supports 'cpu' and 'cuda'. - // Backend WebAssembly: supports 'cpu', 'wasm', 'xnnpack' and 'webnn'. + // Backend WebAssembly: supports 'cpu', 'wasm', 'webgpu' and 'webnn'. // Backend ONNX.js: supports 'webgl'. // Backend React Native: supports 'cpu', 'xnnpack', 'coreml' (iOS), 'nnapi' (Android). interface ExecutionProviderOptionMap { diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 4f1a3943de69a..baf45e74addea 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -26,11 +26,8 @@ if (!BUILD_DEFS.DISABLE_WASM) { } registerBackend('cpu', wasmBackend, 10); registerBackend('wasm', wasmBackend, 10); - if (BUILD_DEFS.DISABLE_TRAINING) { - registerBackend('xnnpack', wasmBackend, 9); - if (!BUILD_DEFS.DISABLE_WEBNN) { - registerBackend('webnn', wasmBackend, 9); - } + if (!BUILD_DEFS.DISABLE_WEBNN) { + registerBackend('webnn', wasmBackend, 9); } } diff --git a/js/web/lib/wasm/session-options.ts b/js/web/lib/wasm/session-options.ts index 45ea48a2df209..41ab2d52ca209 100644 --- a/js/web/lib/wasm/session-options.ts +++ b/js/web/lib/wasm/session-options.ts @@ -60,9 +60,6 @@ const setExecutionProviders = // check EP name switch (epName) { - case 'xnnpack': - epName = 'XNNPACK'; - break; case 'webnn': epName = 'WEBNN'; if (typeof ep !== 'string') { diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index fc74adfed1fee..8f6c5f6f04122 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -36,7 +36,6 @@ Options: webgl webgpu wasm - xnnpack webnn -e=<...>, --env=<...> Specify the environment to run the test. 
Should be one of the following: chrome (default) @@ -111,7 +110,7 @@ Examples: export declare namespace TestRunnerCliArgs { type Mode = 'suite0'|'suite1'|'model'|'unittest'|'op'; - type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'xnnpack'|'webnn'; + type Backend = 'cpu'|'webgl'|'webgpu'|'wasm'|'onnxruntime'|'webnn'; type Environment = 'chrome'|'edge'|'firefox'|'electron'|'safari'|'node'|'bs'; type BundleMode = 'dev'|'perf'; type IOBindingMode = 'none'|'gpu-tensor'|'gpu-location'; @@ -378,13 +377,13 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs } // Option: -b=<...>, --backend=<...> - const browserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack', 'webnn']; + const browserBackends = ['webgl', 'webgpu', 'wasm', 'webnn']; // TODO: remove this when Chrome support WebNN. // we need this for now because Chrome does not support webnn yet, // and ChromeCanary is not in CI. - const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm', 'xnnpack' /*, 'webnn'*/]; + const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm' /*, 'webnn'*/]; const nodejsBackends = ['cpu', 'wasm']; const backendArgs = args.backend || args.b; const backend = (typeof backendArgs !== 'string') ? (env === 'node' ? nodejsBackends : defaultBrowserBackends) : diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 3492c8f3780ea..442cb1bcf1f34 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -96,7 +96,7 @@ async function loadTensors( const outputs: Test.NamedTensor[] = []; let dataFileType: 'none'|'pb'|'npy' = 'none'; - const allowInt64 = ['wasm', 'xnnpack', 'webgpu', 'webnn'].includes(backendName); + const allowInt64 = ['wasm', 'webgpu', 'webnn'].includes(backendName); for (const dataFile of testCase.dataFiles) { const ext = extname(dataFile); @@ -317,7 +317,7 @@ export class TensorResultValidator { } else if (backend === 'webgpu') { this.absoluteThreshold = WEBGPU_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WEBGPU_THRESHOLD_RELATIVE_ERROR; - } else if (backend === 'wasm' || backend === 'xnnpack' || backend === 'webnn') { + } else if (backend === 'wasm' || backend === 'webnn') { this.absoluteThreshold = WASM_THRESHOLD_ABSOLUTE_ERROR; this.relativeThreshold = WASM_THRESHOLD_RELATIVE_ERROR; } else if (backend === 'onnxruntime') { diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 8d4efc79eaca8..8ba3517530edd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -169,12 +169,12 @@ jobs: errorActionPreference: stop displayName: 'Pack NPM packages' - script: | - npm test -- -e=chrome -b=webgl,wasm,xnnpack + npm test -- -e=chrome -b=webgl,wasm workingDirectory: '$(Build.SourcesDirectory)\js\web' - displayName: 'Run ort-web tests (wasm,webgl,xnnpack backend)' + displayName: 'Run ort-web tests (wasm,webgl backend)' condition: eq('${{ parameters.RunWebGpuTests }}', 'false') - script: | - npm test -- -e=chrome -b=webgl,wasm,xnnpack,webgpu $(webgpuCommandlineExtraFlags) + npm test -- -e=chrome -b=webgl,wasm,webgpu $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (ALL backends)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml 
b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml index f7876f15029c1..31ee488318a0b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml @@ -68,15 +68,15 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'npm ci /js/web/' - script: | - npm test -- suite0 -b=wasm,webgl,xnnpack --wasm-init-timeout=30000 --file-cache + npm test -- suite0 -b=wasm,webgl --wasm-init-timeout=30000 --file-cache workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'npm test (Suite0, Chrome)' - script: | - npm test -- suite0 -b=wasm,webgl,xnnpack --env=firefox --wasm-init-timeout=30000 --file-cache + npm test -- suite0 -b=wasm,webgl --env=firefox --wasm-init-timeout=30000 --file-cache workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'npm test (Suite0, Firefox)' - script: | - npm test -- suite0 -b=wasm,webgl,xnnpack --env=edge --wasm-init-timeout=30000 --file-cache + npm test -- suite0 -b=wasm,webgl --env=edge --wasm-init-timeout=30000 --file-cache workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'npm test (Suite0, Edge)' - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From bb4011b2b14cb2702a4922ccd0b070d9ecc49a93 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Sun, 14 Jan 2024 11:36:49 -0800 Subject: [PATCH 02/39] Set default flags nvcc and do not set default compile flags for ROCM EP (#19124) ### Description Set default flags nvcc and do not set the flags for ROCM EP. ### Motivation and Context 1. To meet a BinSkim requirement for CUDA EP. https://github.com/microsoft/binskim/blob/main/docs/BinSkimRules.md#rule-BA2024EnableSpectreMitigations 2. The ROCM EP's pipeline is broken since PR #19073 . Unit tests failed to load the EP with the following error message: Failed to load library libonnxruntime_providers_rocm.so with error: /build/Release/libonnxruntime_providers_rocm.so: undefined symbol: vtable for onnxruntime::InsertMaxPoolOutput . This PR is a hot fix to bring the pipeline back. So far I don't know why the error happened. The symbol "InsertMaxPoolOutput" is in onnxruntime_optimizers. I don't see any EP code references it directly. --- tools/ci_build/build.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 315b9a237b1c4..0da4adb51767d 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1474,15 +1474,18 @@ def generate_build_tree( cflags = None cxxflags = None ldflags = None + cudaflags = [] for config in configs: # Setup default values for cflags/cxxflags/ldflags. # The values set here are purely for security and compliance purposes. ONNX Runtime should work fine without these flags. if ( "CFLAGS" not in os.environ and "CXXFLAGS" not in os.environ + and (not args.use_cuda or "CUDAFLAGS" not in os.environ) and not args.ios and not args.android and not args.build_wasm + and not args.use_rocm and not (is_linux() and platform.machine() != "aarch64" and platform.machine() != "x86_64") ): if is_windows(): @@ -1515,9 +1518,19 @@ def generate_build_tree( cxxflags = cflags.copy() if not args.disable_exceptions: cxxflags += ["/EHsc"] + if args.use_cuda: + # On Windows, nvcc passes /EHsc to the host compiler by default. 
+ cuda_compile_flags_str = "" + for compile_flag in cflags: + if compile_flag.startswith("/D"): + cudaflags.append(compile_flag) + else: + cuda_compile_flags_str = cuda_compile_flags_str + " " + compile_flag + if len(cuda_compile_flags_str) != 0: + cudaflags.append('-Xcompiler="%s"' % cuda_compile_flags_str) elif is_linux() or is_macOS(): if is_linux(): - ldflags = ["-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now"] + ldflags = ["-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now", "-Wl,-z,noexecstack"] else: ldflags = [] if config == "Release": @@ -1560,7 +1573,8 @@ def generate_build_tree( # The following flags needs GCC 8 and newer cflags += ["-fstack-clash-protection", "-fcf-protection"] cxxflags = cflags.copy() - + if args.use_cuda: + cudaflags = cflags.copy() config_build_dir = get_config_build_dir(build_dir, config) os.makedirs(config_build_dir, exist_ok=True) if args.use_tvm: @@ -1580,6 +1594,8 @@ def generate_build_tree( "-DCMAKE_C_FLAGS=%s" % (" ".join(cflags)), "-DCMAKE_CXX_FLAGS=%s" % (" ".join(cxxflags)), ] + if cudaflags is not None and len(cudaflags) != 0: + temp_cmake_args += ["-DCMAKE_CUDA_FLAGS_INIT=%s" % (" ".join(cudaflags))] if ldflags is not None and len(ldflags) != 0: temp_cmake_args += [ "-DCMAKE_EXE_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), From 76797127d6a3125fc59e605670809957a2183cbe Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Sun, 14 Jan 2024 14:37:26 -0500 Subject: [PATCH 03/39] Always download cuda and trt libraries from Azure blob (#19118) ### Description This way, we will not need to update the windows images constantly and allow more flexibility to choose the cuda version in the future. --- .../c-api-noopenmp-packaging-pipelines.yml | 2 ++ .../jobs/download_win_gpu_library.yml | 36 +++++++++++-------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 93d3b7f37008b..f80b035582f18 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -1172,6 +1172,7 @@ stages: ArtifactSuffix: 'GPU' StageSuffix: 'GPU' Skipx86Tests: 'true' + CudaVersion: ${{ parameters.CudaVersion }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} @@ -1183,6 +1184,7 @@ stages: StageSuffix: 'GPU' MoreSuffix: '_Windows' Skipx86Tests: 'true' + CudaVersion: ${{ parameters.CudaVersion }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index b7ae9ffa3c219..538cccd3c903b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -20,31 +20,37 @@ steps: - powershell: | Write-Host "##vso[task.prependpath]$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}\bin;$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}\extras\CUPTI\lib64" displayName: 'Append CUDA SDK Directory to PATH' + - task: CmdLine@2 inputs: script: | echo %PATH% - displayName: 'Print PATH' + displayName: 'Print PATH after download CUDA SDK' - ${{ if eq(parameters.DownloadTRT, true) }}: - ${{ if eq(parameters.CudaVersion, '11.8') }}: - - 
powershell: | - azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8' - - powershell: | - Write-Host "##vso[task.prependpath]$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8\lib" - displayName: 'Append TensorRT Directory to PATH' - + - bash: | + echo "##vso[task.setvariable variable=trtCudaVersion]11.8" + displayName: Set trtCudaVersion - ${{ if eq(parameters.CudaVersion, '12.2') }}: - - powershell: | - azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0' - - powershell: | - Write-Host "##vso[task.prependpath]$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0\lib" - displayName: 'Append TensorRT Directory to PATH' + - bash: | + echo "##vso[task.setvariable variable=trtCudaVersion]12.0" + displayName: Set trtCudaVersion + + - bash: | + echo $(trtCudaVersion) + displayName: Get trtCudaVersion + + - powershell: | + azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-$(trtCudaVersion) $(Agent.TempDirectory) + displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-$(trtCudaVersion)' + + - powershell: | + Write-Host "##vso[task.prependpath]$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-$(trtCudaVersion)\lib" + displayName: 'Append TensorRT Directory to PATH' - task: CmdLine@2 inputs: script: | echo %PATH% - displayName: 'Print PATH' \ No newline at end of file + displayName: 'Print PATH after download TensorRT' \ No newline at end of file From c3ce9df80c2cfc7013445f8b44213f3e75cac753 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Sun, 14 Jan 2024 17:51:00 -0500 Subject: [PATCH 04/39] Disabling python3.12 on training python packaging pipleines (#19123) --- .../templates/py-packaging-training-cuda-stage.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index e7b935712ac6c..158037661f072 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -98,12 +98,13 @@ stages: OpsetVersion: ${{ parameters.opset_version }} CudaVersion: ${{ parameters.cuda_version }} UploadWheel: ${{ parameters.upload_wheel }} - Python312: - PythonVersion: '3.12' - TorchVersion: ${{ parameters.torch_version }} - OpsetVersion: ${{ parameters.opset_version }} - CudaVersion: ${{ parameters.cuda_version }} - UploadWheel: ${{ parameters.upload_wheel }} +# TODO: enable this when we have torch support pyton 3.12 +# Python312: +# PythonVersion: '3.12' +# TorchVersion: ${{ parameters.torch_version }} +# OpsetVersion: ${{ parameters.opset_version }} +# CudaVersion: ${{ parameters.cuda_version }} +# UploadWheel: ${{ parameters.upload_wheel }} steps: - task: CmdLine@2 From 71657d1eb8b0a24a4b6584d9e904506a0b4e1521 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Sun, 14 Jan 2024 17:53:26 -0500 Subject: [PATCH 05/39] [java] Fix double close (#19133) ### Description The `OnnxValue` and `OrtProviderOptions` implementations now check to see if they've been closed before 
accessing the native pointer, and also before close is called. ### Motivation and Context Before they could be closed twice which SIGSEGV'd the JVM. Fixes #19125. --- .../src/main/java/ai/onnxruntime/OnnxMap.java | 27 +++++++++++++-- .../java/ai/onnxruntime/OnnxSequence.java | 27 +++++++++++++-- .../java/ai/onnxruntime/OnnxSparseTensor.java | 18 ++++++++-- .../main/java/ai/onnxruntime/OnnxTensor.java | 24 +++++++++++--- .../java/ai/onnxruntime/OnnxTensorLike.java | 16 +++++++++ .../main/java/ai/onnxruntime/OnnxValue.java | 9 ++++- .../ai/onnxruntime/OrtProviderOptions.java | 30 ++++++++++++++++- .../ai/onnxruntime/OrtTrainingSession.java | 33 +++++++++++++++++-- .../StringConfigProviderOptions.java | 1 + .../java/ai/onnxruntime/InferenceTest.java | 2 ++ .../java/ai/onnxruntime/OnnxTensorTest.java | 27 +++++++++++++-- .../test/java/ai/onnxruntime/TestHelpers.java | 12 +++++++ 12 files changed, 208 insertions(+), 18 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/OnnxMap.java b/java/src/main/java/ai/onnxruntime/OnnxMap.java index 354ebec61274d..cde9f0de4ff0a 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxMap.java +++ b/java/src/main/java/ai/onnxruntime/OnnxMap.java @@ -8,6 +8,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import java.util.logging.Logger; /** * A container for a map returned by {@link OrtSession#run(Map)}. @@ -16,6 +17,7 @@ * values: String, Long, Float, Double. */ public class OnnxMap implements OnnxValue { + private static final Logger logger = Logger.getLogger(OnnxMap.class.getName()); static { try { @@ -107,6 +109,8 @@ public static OnnxMapValueType mapFromOnnxJavaType(OnnxJavaType type) { private final OnnxMapValueType valueType; + private boolean closed; + /** * Constructs an OnnxMap containing a reference to the native map along with the type information. * @@ -122,6 +126,7 @@ public static OnnxMapValueType mapFromOnnxJavaType(OnnxJavaType type) { this.info = info; this.stringKeys = info.keyType == OnnxJavaType.STRING; this.valueType = OnnxMapValueType.mapFromOnnxJavaType(info.valueType); + this.closed = false; } /** @@ -146,6 +151,7 @@ public OnnxValueType getType() { */ @Override public Map getValue() throws OrtException { + checkClosed(); Object[] keys = getMapKeys(); Object[] values = getMapValues(); HashMap map = new HashMap<>(OrtUtil.capacityFromSize(keys.length)); @@ -222,10 +228,27 @@ public String toString() { return "ONNXMap(size=" + size() + ",info=" + info.toString() + ")"; } + @Override + public synchronized boolean isClosed() { + return closed; + } + /** Closes this map, releasing the native memory backing it and it's elements. */ @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed map."); + } + } + + /** Checks if the OnnxValue is closed, if so throws {@link IllegalStateException}. 
*/ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OnnxValue"); + } } private native String[] getStringKeys(long apiHandle, long nativeHandle, long allocatorHandle) diff --git a/java/src/main/java/ai/onnxruntime/OnnxSequence.java b/java/src/main/java/ai/onnxruntime/OnnxSequence.java index 93e1be21588b4..7722514b913b6 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxSequence.java +++ b/java/src/main/java/ai/onnxruntime/OnnxSequence.java @@ -8,6 +8,7 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.logging.Logger; /** * A sequence of {@link OnnxValue}s all of the same type. @@ -24,6 +25,7 @@ * */ public class OnnxSequence implements OnnxValue { + private static final Logger logger = Logger.getLogger(OnnxSequence.class.getName()); static { try { @@ -40,6 +42,8 @@ public class OnnxSequence implements OnnxValue { private final SequenceInfo info; + private boolean closed; + /** * Creates the wrapper object for a native sequence. * @@ -53,6 +57,7 @@ public class OnnxSequence implements OnnxValue { this.nativeHandle = nativeHandle; this.allocatorHandle = allocatorHandle; this.info = info; + this.closed = false; } @Override @@ -76,6 +81,7 @@ public OnnxValueType getType() { */ @Override public List getValue() throws OrtException { + checkClosed(); if (info.sequenceOfMaps) { OnnxMap[] maps = getMaps(OnnxRuntime.ortApiHandle, nativeHandle, allocatorHandle); return Collections.unmodifiableList(Arrays.asList(maps)); @@ -110,10 +116,27 @@ public String toString() { return "OnnxSequence(info=" + info.toString() + ")"; } + @Override + public synchronized boolean isClosed() { + return closed; + } + /** Closes this sequence, releasing the native memory backing it and it's elements. */ @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed sequence."); + } + } + + /** Checks if the OnnxValue is closed, if so throws {@link IllegalStateException}. */ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OnnxValue"); + } } private native OnnxMap[] getMaps(long apiHandle, long nativeHandle, long allocatorHandle) diff --git a/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java b/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java index 53bd4c7f9b3e6..804fe742ad624 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java +++ b/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java @@ -14,6 +14,7 @@ import java.nio.LongBuffer; import java.nio.ShortBuffer; import java.util.Arrays; +import java.util.logging.Logger; /** * A Java object wrapping an OnnxSparseTensor. @@ -22,6 +23,7 @@ * different static inner class representing each type. */ public final class OnnxSparseTensor extends OnnxTensorLike { + private static final Logger logger = Logger.getLogger(OnnxSparseTensor.class.getName()); private final SparseTensorType sparseTensorType; // Held to prevent deallocation while used in native code. 
@@ -198,6 +200,7 @@ public OnnxValueType getType() { @Override public SparseTensor getValue() throws OrtException { + checkClosed(); Buffer buffer = getValuesBuffer(); long[] indicesShape = getIndicesShape(OnnxRuntime.ortApiHandle, nativeHandle); switch (sparseTensorType) { @@ -234,8 +237,13 @@ public SparseTensor getValue() throws OrtException { } @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed OnnxSparseTensor."); + } } /** @@ -257,6 +265,7 @@ public SparseTensorType getSparseTensorType() { * @return The indices. */ public Buffer getIndicesBuffer() { + checkClosed(); switch (sparseTensorType) { case COO: case CSRC: @@ -295,6 +304,7 @@ public Buffer getIndicesBuffer() { * @return The inner indices. */ public LongBuffer getInnerIndicesBuffer() { + checkClosed(); if (sparseTensorType == SparseTensorType.CSRC) { LongBuffer buf = getInnerIndicesBuffer(OnnxRuntime.ortApiHandle, nativeHandle) @@ -320,6 +330,7 @@ public LongBuffer getInnerIndicesBuffer() { * @return The data buffer. */ public Buffer getValuesBuffer() { + checkClosed(); ByteBuffer buffer = getValuesBuffer(OnnxRuntime.ortApiHandle, nativeHandle).order(ByteOrder.nativeOrder()); switch (info.type) { @@ -396,6 +407,7 @@ public Buffer getValuesBuffer() { * @return The indices shape. */ public long[] getIndicesShape() { + checkClosed(); return getIndicesShape(OnnxRuntime.ortApiHandle, nativeHandle); } @@ -405,6 +417,7 @@ public long[] getIndicesShape() { * @return The indices shape. */ public long[] getInnerIndicesShape() { + checkClosed(); if (sparseTensorType == SparseTensorType.CSRC) { return getInnerIndicesShape(OnnxRuntime.ortApiHandle, nativeHandle); } else { @@ -420,6 +433,7 @@ public long[] getInnerIndicesShape() { * @return The values shape. */ public long[] getValuesShape() { + checkClosed(); return getValuesShape(OnnxRuntime.ortApiHandle, nativeHandle); } diff --git a/java/src/main/java/ai/onnxruntime/OnnxTensor.java b/java/src/main/java/ai/onnxruntime/OnnxTensor.java index 0078adb6402f8..e1ee2c14fd9d1 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxTensor.java +++ b/java/src/main/java/ai/onnxruntime/OnnxTensor.java @@ -14,12 +14,14 @@ import java.nio.LongBuffer; import java.nio.ShortBuffer; import java.util.Optional; +import java.util.logging.Logger; /** * A Java object wrapping an OnnxTensor. Tensors are the main input to the library, and can also be * returned as outputs. */ public class OnnxTensor extends OnnxTensorLike { + private static final Logger logger = Logger.getLogger(OnnxTensor.class.getName()); /** * This reference is held for OnnxTensors backed by a java.nio.Buffer to ensure the buffer does @@ -97,6 +99,7 @@ public OnnxValueType getType() { */ @Override public Object getValue() throws OrtException { + checkClosed(); if (info.isScalar()) { switch (info.type) { case FLOAT: @@ -144,16 +147,21 @@ public Object getValue() throws OrtException { @Override public String toString() { - return "OnnxTensor(info=" + info.toString() + ")"; + return "OnnxTensor(info=" + info.toString() + ",closed=" + closed + ")"; } /** - * Closes the tensor, releasing it's underlying memory (if it's not backed by an NIO buffer). If - * it is backed by a buffer then the memory is released when the buffer is GC'd. + * Closes the tensor, releasing its underlying memory (if it's not backed by an NIO buffer). 
If it + * is backed by a buffer then the memory is released when the buffer is GC'd. */ @Override - public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed tensor."); + } } /** @@ -165,6 +173,7 @@ public void close() { * @return A ByteBuffer copy of the OnnxTensor. */ public ByteBuffer getByteBuffer() { + checkClosed(); if (info.type != OnnxJavaType.STRING) { ByteBuffer buffer = getBuffer(OnnxRuntime.ortApiHandle, nativeHandle); ByteBuffer output = ByteBuffer.allocate(buffer.capacity()); @@ -183,6 +192,7 @@ public ByteBuffer getByteBuffer() { * @return A FloatBuffer copy of the OnnxTensor. */ public FloatBuffer getFloatBuffer() { + checkClosed(); if (info.type == OnnxJavaType.FLOAT) { // if it's fp32 use the efficient copy. FloatBuffer buffer = getBuffer().asFloatBuffer(); @@ -212,6 +222,7 @@ public FloatBuffer getFloatBuffer() { * @return A DoubleBuffer copy of the OnnxTensor. */ public DoubleBuffer getDoubleBuffer() { + checkClosed(); if (info.type == OnnxJavaType.DOUBLE) { DoubleBuffer buffer = getBuffer().asDoubleBuffer(); DoubleBuffer output = DoubleBuffer.allocate(buffer.capacity()); @@ -230,6 +241,7 @@ public DoubleBuffer getDoubleBuffer() { * @return A ShortBuffer copy of the OnnxTensor. */ public ShortBuffer getShortBuffer() { + checkClosed(); if ((info.type == OnnxJavaType.INT16) || (info.type == OnnxJavaType.FLOAT16) || (info.type == OnnxJavaType.BFLOAT16)) { @@ -250,6 +262,7 @@ public ShortBuffer getShortBuffer() { * @return An IntBuffer copy of the OnnxTensor. */ public IntBuffer getIntBuffer() { + checkClosed(); if (info.type == OnnxJavaType.INT32) { IntBuffer buffer = getBuffer().asIntBuffer(); IntBuffer output = IntBuffer.allocate(buffer.capacity()); @@ -268,6 +281,7 @@ public IntBuffer getIntBuffer() { * @return A LongBuffer copy of the OnnxTensor. */ public LongBuffer getLongBuffer() { + checkClosed(); if (info.type == OnnxJavaType.INT64) { LongBuffer buffer = getBuffer().asLongBuffer(); LongBuffer output = LongBuffer.allocate(buffer.capacity()); diff --git a/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java b/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java index c2989fe296dc2..bbfd4e981ece2 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java +++ b/java/src/main/java/ai/onnxruntime/OnnxTensorLike.java @@ -28,6 +28,9 @@ public abstract class OnnxTensorLike implements OnnxValue { /** The size and shape information for this tensor. */ protected final TensorInfo info; + /** Is this value closed? */ + protected boolean closed; + /** * Constructs a tensor-like (the base class of OnnxTensor and OnnxSparseTensor). * @@ -39,6 +42,7 @@ public abstract class OnnxTensorLike implements OnnxValue { this.nativeHandle = nativeHandle; this.allocatorHandle = allocatorHandle; this.info = info; + this.closed = false; } /** @@ -59,4 +63,16 @@ long getNativeHandle() { public TensorInfo getInfo() { return info; } + + @Override + public synchronized boolean isClosed() { + return closed; + } + + /** Checks if the OnnxValue is closed, if so throws {@link IllegalStateException}. 
*/ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OnnxValue"); + } + } } diff --git a/java/src/main/java/ai/onnxruntime/OnnxValue.java b/java/src/main/java/ai/onnxruntime/OnnxValue.java index 752a0e74267d3..e829bc80f09f6 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxValue.java +++ b/java/src/main/java/ai/onnxruntime/OnnxValue.java @@ -64,7 +64,14 @@ public enum OnnxValueType { */ public ValueInfo getInfo(); - /** Closes the OnnxValue, freeing it's native memory. */ + /** + * Checks if this value is closed (i.e., the native object has been released). + * + * @return True if the value is closed and the native object has been released. + */ + public boolean isClosed(); + + /** Closes the OnnxValue, freeing its native memory. */ @Override public void close(); diff --git a/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java b/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java index 39a5121fad7a2..70af10ff8cd79 100644 --- a/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java +++ b/java/src/main/java/ai/onnxruntime/OrtProviderOptions.java @@ -5,11 +5,14 @@ package ai.onnxruntime; import java.io.IOException; +import java.util.logging.Logger; /** An abstract base class for execution provider options classes. */ // Note this lives in ai.onnxruntime to allow subclasses to access the OnnxRuntime.ortApiHandle // package private field. public abstract class OrtProviderOptions implements AutoCloseable { + private static final Logger logger = Logger.getLogger(OrtProviderOptions.class.getName()); + static { try { OnnxRuntime.init(); @@ -21,6 +24,9 @@ public abstract class OrtProviderOptions implements AutoCloseable { /** The native pointer. */ protected final long nativeHandle; + /** Is the native object closed? */ + protected boolean closed; + /** * Constructs a OrtProviderOptions wrapped around a native pointer. * @@ -28,6 +34,7 @@ public abstract class OrtProviderOptions implements AutoCloseable { */ protected OrtProviderOptions(long nativeHandle) { this.nativeHandle = nativeHandle; + this.closed = false; } /** @@ -46,9 +53,30 @@ protected static long getApiHandle() { */ public abstract OrtProvider getProvider(); + /** + * Is the native object closed? + * + * @return True if the native object has been released. + */ + public synchronized boolean isClosed() { + return closed; + } + @Override public void close() { - close(OnnxRuntime.ortApiHandle, nativeHandle); + if (!closed) { + close(OnnxRuntime.ortApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing an already closed tensor."); + } + } + + /** Checks if the OrtProviderOptions is closed, if so throws {@link IllegalStateException}. */ + protected void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OrtProviderOptions"); + } } /** diff --git a/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java b/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java index 49ddf29c22335..eeede3a1bed0b 100644 --- a/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java +++ b/java/src/main/java/ai/onnxruntime/OrtTrainingSession.java @@ -12,6 +12,7 @@ import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.logging.Logger; /** * Wraps an ONNX training model and allows training and inference calls. @@ -1049,8 +1050,12 @@ private native void exportModelForInference( /** Wrapper class for the checkpoint state. 
*/ static final class OrtCheckpointState implements AutoCloseable { + private static final Logger logger = Logger.getLogger(OrtCheckpointState.class.getName()); + final long nativeHandle; + private boolean closed; + /** * Wraps an object around the checkpoint native handle. * @@ -1058,6 +1063,7 @@ static final class OrtCheckpointState implements AutoCloseable { */ OrtCheckpointState(long nativeHandle) { this.nativeHandle = nativeHandle; + this.closed = false; } /** @@ -1097,6 +1103,7 @@ static OrtCheckpointState loadCheckpoint(String checkpoint) throws OrtException * @throws OrtException If the checkpoint failed to save. */ public void saveCheckpoint(Path outputPath, boolean saveOptimizer) throws OrtException { + checkClosed(); Objects.requireNonNull(outputPath, "checkpoint path must not be null"); String outputStr = outputPath.toString(); saveCheckpoint( @@ -1115,6 +1122,7 @@ public void saveCheckpoint(Path outputPath, boolean saveOptimizer) throws OrtExc * @throws OrtException If the call failed. */ public void addProperty(String name, float value) throws OrtException { + checkClosed(); addProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, nativeHandle, name, value); } @@ -1127,6 +1135,7 @@ public void addProperty(String name, float value) throws OrtException { * @throws OrtException If the call failed. */ public void addProperty(String name, int value) throws OrtException { + checkClosed(); addProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, nativeHandle, name, value); } @@ -1139,6 +1148,7 @@ public void addProperty(String name, int value) throws OrtException { * @throws OrtException If the call failed. */ public void addProperty(String name, String value) throws OrtException { + checkClosed(); addProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, nativeHandle, name, value); } @@ -1152,6 +1162,7 @@ public void addProperty(String name, String value) throws OrtException { * @throws OrtException If the property does not exist, or is of the wrong type. */ public float getFloatProperty(OrtAllocator allocator, String name) throws OrtException { + checkClosed(); return getFloatProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, @@ -1169,6 +1180,7 @@ public float getFloatProperty(OrtAllocator allocator, String name) throws OrtExc * @throws OrtException If the property does not exist, or is of the wrong type. */ public int getIntProperty(OrtAllocator allocator, String name) throws OrtException { + checkClosed(); return getIntProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, @@ -1186,6 +1198,7 @@ public int getIntProperty(OrtAllocator allocator, String name) throws OrtExcepti * @throws OrtException If the property does not exist, or is of the wrong type. */ public String getStringProperty(OrtAllocator allocator, String name) throws OrtException { + checkClosed(); return getStringProperty( OnnxRuntime.ortApiHandle, OnnxRuntime.ortTrainingApiHandle, @@ -1194,9 +1207,25 @@ public String getStringProperty(OrtAllocator allocator, String name) throws OrtE name); } + /** Checks if the OrtCheckpointState is closed, if so throws {@link IllegalStateException}. 
*/ + private void checkClosed() { + if (closed) { + throw new IllegalStateException("Trying to use a closed OrtCheckpointState"); + } + } + + public synchronized boolean isClosed() { + return closed; + } + @Override - public void close() { - close(OnnxRuntime.ortTrainingApiHandle, nativeHandle); + public synchronized void close() { + if (!closed) { + close(OnnxRuntime.ortTrainingApiHandle, nativeHandle); + closed = true; + } else { + logger.warning("Closing a checkpoint twice"); + } } /* diff --git a/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java b/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java index 02207b2949e54..961163035c9a6 100644 --- a/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java +++ b/java/src/main/java/ai/onnxruntime/providers/StringConfigProviderOptions.java @@ -32,6 +32,7 @@ protected StringConfigProviderOptions(long nativeHandle) { * @throws OrtException If the addition failed. */ public void add(String key, String value) throws OrtException { + checkClosed(); Objects.requireNonNull(key, "Key must not be null"); Objects.requireNonNull(value, "Value must not be null"); options.put(key, value); diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index e975117fb75bd..f6f9da1829402 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -69,7 +69,9 @@ public void environmentTest() { // Checks that the environment instance is the same. OrtEnvironment otherEnv = OrtEnvironment.getEnvironment(); assertSame(env, otherEnv); + TestHelpers.quietLogger(OrtEnvironment.class); otherEnv = OrtEnvironment.getEnvironment("test-name"); + TestHelpers.loudLogger(OrtEnvironment.class); assertSame(env, otherEnv); } diff --git a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java index a5f285ba86a14..c060cf73ecf14 100644 --- a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java +++ b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java @@ -4,6 +4,10 @@ */ package ai.onnxruntime; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; + import ai.onnxruntime.platform.Fp16Conversions; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -97,8 +101,8 @@ public void testBufferCreation() throws OrtException { float[] arrValues = new float[] {0, 1, 2, 3, 4}; try (OnnxTensor t = OnnxTensor.createTensor(env, arrValues)) { // array creation isn't backed by buffers - Assertions.assertFalse(t.ownsBuffer()); - Assertions.assertFalse(t.getBufferRef().isPresent()); + assertFalse(t.ownsBuffer()); + assertFalse(t.getBufferRef().isPresent()); FloatBuffer buf = t.getFloatBuffer(); float[] output = new float[arrValues.length]; buf.get(output); @@ -146,7 +150,7 @@ public void testBufferCreation() throws OrtException { directBuffer.rewind(); try (OnnxTensor t = OnnxTensor.createTensor(env, directBuffer, new long[] {1, 5})) { // direct buffers don't trigger a copy - Assertions.assertFalse(t.ownsBuffer()); + assertFalse(t.ownsBuffer()); // tensors backed by buffers can get the buffer ref back out Assertions.assertTrue(t.getBufferRef().isPresent()); FloatBuffer buf = t.getFloatBuffer(); @@ -428,4 +432,21 @@ public void testBf16RoundTrip() { } } } + + @Test + public void testClose() throws OrtException 
{ + OrtEnvironment env = OrtEnvironment.getEnvironment(); + long[] input = new long[] {1, 2, 3, 4, 5}; + OnnxTensor value = OnnxTensor.createTensor(env, input); + assertFalse(value.isClosed()); + long[] output = (long[]) value.getValue(); + assertArrayEquals(input, output); + value.close(); + // check use after close throws + assertThrows(IllegalStateException.class, value::getValue); + // check double close doesn't crash (emits warning) + TestHelpers.quietLogger(OnnxTensor.class); + value.close(); + TestHelpers.loudLogger(OnnxTensor.class); + } } diff --git a/java/src/test/java/ai/onnxruntime/TestHelpers.java b/java/src/test/java/ai/onnxruntime/TestHelpers.java index 55d8169434d48..c13cdf222b15b 100644 --- a/java/src/test/java/ai/onnxruntime/TestHelpers.java +++ b/java/src/test/java/ai/onnxruntime/TestHelpers.java @@ -22,6 +22,8 @@ import java.util.Comparator; import java.util.List; import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; import java.util.regex.Pattern; import org.junit.jupiter.api.Assertions; @@ -258,6 +260,16 @@ static void flattenStringBase(String[] input, List output) { output.addAll(Arrays.asList(input)); } + static void loudLogger(Class loggerClass) { + Logger l = Logger.getLogger(loggerClass.getName()); + l.setLevel(Level.INFO); + } + + static void quietLogger(Class loggerClass) { + Logger l = Logger.getLogger(loggerClass.getName()); + l.setLevel(Level.OFF); + } + public static Path getResourcePath(String path) { return new File(TestHelpers.class.getResource(path).getFile()).toPath(); } From b2ce3eedb9f3d9cee82525c9f29c2d1f42ba58c7 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Mon, 15 Jan 2024 15:09:49 +1000 Subject: [PATCH 06/39] Fix build error for CoreML Split op (#19099) ### Description The `split` input of the Split op is int64_t. Fixing that resolves a type mismatch build error on Windows when CoreML is enabled (for debugging the partitioning code). ### Motivation and Context Fix build error --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../core/providers/coreml/builders/impl/split_op_builder.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc index 815f68128ffaf..56c87c883156b 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc @@ -139,8 +139,8 @@ bool SplitOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPar } const auto& splits_tensor = *initializers.at(input_defs[1]->Name()); Initializer unpacked_tensor(splits_tensor); - auto splits_span = unpacked_tensor.DataAsSpan(); - int sum_of_splits = std::accumulate(splits_span.begin(), splits_span.end(), 0); + auto splits_span = unpacked_tensor.DataAsSpan(); + int64_t sum_of_splits = std::accumulate(splits_span.begin(), splits_span.end(), int64_t{0}); if (sum_of_splits != split_dims_at_axis) { LOGS(logger, VERBOSE) << "Mismatch between the sum of 'split'. Expected: " << split_dims_at_axis From 922a2f00e3855fdc9852ed1bfe7f6f0a88e40a24 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 15 Jan 2024 14:37:22 +0800 Subject: [PATCH 07/39] Extend timeout in Nuget-CUDA-Packaging-Pipeline (#19138) ### Description ### Motivation and Context Linux_GPU_x64 job in the pipeline has been canceled due to timeout since 0112. 
--- .../azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
index fbdd67bb5de22..48a6e0e8529e6 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml
@@ -15,7 +15,7 @@ stages: - job: workspace: clean: all - timeoutInMinutes: 120 + timeoutInMinutes: 150 pool: 'Onnxruntime-Linux-GPU' variables: - name: CUDA_VERSION_MAJOR

From a97199c62de4a96939624ba511313d0f81014f56 Mon Sep 17 00:00:00 2001
From: Ben Niu
Date: Mon, 15 Jan 2024 14:29:19 -0800
Subject: [PATCH 08/39] Fix Arm64EC build for test_q4qdq.cpp (#18523)

### Description
Fix the ifdef guards in test_q4qdq.cpp so that the code blocks intended only for native x64 compilation are excluded when building for Arm64EC, instead of being compiled for both x64 and Arm64EC.

--- onnxruntime/test/mlas/unittest/test_q4qdq.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/mlas/unittest/test_q4qdq.cpp b/onnxruntime/test/mlas/unittest/test_q4qdq.cpp
index 955c3b1201989..c317395bee970 100644
--- a/onnxruntime/test/mlas/unittest/test_q4qdq.cpp
+++ b/onnxruntime/test/mlas/unittest/test_q4qdq.cpp
@@ -19,7 +19,7 @@ Module Name: #include "test_util.h" #include "mlas_q4.h" -#if (defined(_M_AMD64) || defined(__x86_64__)) +#if ((defined(_M_AMD64) && !defined(_M_ARM64EC)) || defined(__x86_64__)) /** * @brief For testing purpose, @@ -93,7 +93,7 @@ class MlasQ4dqTest : public MlasTestBase { << K << "] QType: " << qtype; } -#if (defined(_M_AMD64) || defined(__x86_64__)) +#if ((defined(_M_AMD64) && !defined(_M_ARM64EC)) || defined(__x86_64__)) /* Test MlasBlkQ4DequantSgemmPackB, make sure we can reuse SGEMM kernel as it rearrange B the same way as sgemm pack B*/ const size_t AlignedN = (N + 15) & ~15;

From 191525301f2b30fa4ff7337cd40c5f3f94834488 Mon Sep 17 00:00:00 2001
From: Adam Pocock
Date: Mon, 15 Jan 2024 17:42:50 -0500
Subject: [PATCH 09/39] [java] Updating TensorInfo so it contains the named dimensions (#18962)

### Description
The Java `TensorInfo` object, which is used to describe a tensor's shape along with a model's input and output placeholders, couldn't show any symbolic/named dimensions of that tensor. Now this information is stored in Java strings on construction and included in the `toString` output.

### Motivation and Context
Setting symbolic dimensions required external information in Java; the names were not discoverable from within the API.

--- .../main/java/ai/onnxruntime/TensorInfo.java | 63 ++++++++++++++++--- java/src/main/native/OrtJniUtil.c | 26 ++++++-- .../java/ai/onnxruntime/InferenceTest.java | 6 ++ 3 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/java/src/main/java/ai/onnxruntime/TensorInfo.java b/java/src/main/java/ai/onnxruntime/TensorInfo.java
index 69ccb954e8afe..1c21387b50455 100644
--- a/java/src/main/java/ai/onnxruntime/TensorInfo.java
+++ b/java/src/main/java/ai/onnxruntime/TensorInfo.java
@@ -7,6 +7,7 @@ import java.lang.reflect.Array; import java.nio.Buffer; import java.util.Arrays; +import java.util.stream.Collectors; /** Describes an {@link OnnxTensor}, including its size, shape and element type. */ public class TensorInfo implements ValueInfo { @@ -159,6 +160,12 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { /** The shape of the tensor.
*/ final long[] shape; + /** The names of the unbound dimensions. */ + final String[] dimensionNames; + + /** If there are non-empty dimension names */ + private final boolean hasNames; + /** The Java type of this tensor. */ public final OnnxJavaType type; @@ -177,6 +184,9 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { */ TensorInfo(long[] shape, OnnxJavaType type, OnnxTensorType onnxType) { this.shape = shape; + this.dimensionNames = new String[shape.length]; + Arrays.fill(dimensionNames, ""); + this.hasNames = false; this.type = type; this.onnxType = onnxType; this.numElements = elementCount(shape); @@ -188,10 +198,20 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { *

Called from JNI. * * @param shape The tensor shape. + * @param names The dimension names. * @param typeInt The native type int. */ - TensorInfo(long[] shape, int typeInt) { + TensorInfo(long[] shape, String[] names, int typeInt) { this.shape = shape; + this.dimensionNames = names; + boolean hasNames = false; + for (String s : names) { + if (!s.isEmpty()) { + hasNames = true; + break; + } + } + this.hasNames = hasNames; this.onnxType = OnnxTensorType.mapFromInt(typeInt); this.type = OnnxJavaType.mapFromOnnxTensorType(this.onnxType); this.numElements = elementCount(shape); @@ -206,15 +226,42 @@ public long[] getShape() { return Arrays.copyOf(shape, shape.length); } + /** + * Get a copy of the tensor's named dimensions. + * + * @return A copof the tensor's named dimensions. + */ + public String[] getDimensionNames() { + return Arrays.copyOf(dimensionNames, dimensionNames.length); + } + @Override public String toString() { - return "TensorInfo(javaType=" - + type.toString() - + ",onnxType=" - + onnxType.toString() - + ",shape=" - + Arrays.toString(shape) - + ")"; + String output = + "TensorInfo(javaType=" + + type.toString() + + ",onnxType=" + + onnxType.toString() + + ",shape=" + + Arrays.toString(shape); + if (hasNames) { + output = + output + + ",dimNames=[" + + Arrays.stream(dimensionNames) + .map( + a -> { + if (a.isEmpty()) { + return "\"\""; + } else { + return a; + } + }) + .collect(Collectors.joining(",")) + + "]"; + } + output = output + ")"; + return output; } /** diff --git a/java/src/main/native/OrtJniUtil.c b/java/src/main/native/OrtJniUtil.c index 879ba8a310618..7b26291581395 100644 --- a/java/src/main/native/OrtJniUtil.c +++ b/java/src/main/native/OrtJniUtil.c @@ -342,7 +342,6 @@ jobject convertToTensorInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTensorT if (code != ORT_OK) { return NULL; } - //printf("numDim %d\n",numDim); int64_t* dimensions = (int64_t*) malloc(sizeof(int64_t)*numDim); code = checkOrtStatus(jniEnv, api, api->GetDimensions(info, dimensions, numDim)); if (code != ORT_OK) { @@ -358,12 +357,31 @@ jobject convertToTensorInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTensorT free(dimensions); dimensions = NULL; + // Create the string array for the names. + const char** dimensionNames = (const char**) malloc(sizeof(char*)*numDim); + if (dimensionNames == NULL) { + throwOrtException(jniEnv, 1, "Not enough memory"); + return NULL; + } + code = checkOrtStatus(jniEnv, api, api->GetSymbolicDimensions(info, dimensionNames, numDim)); + if (code != ORT_OK) { + // extraction failed, exception has been thrown, return to Java. 
+ free(dimensionNames); + return NULL; + } + jclass stringClazz = (*jniEnv)->FindClass(jniEnv, "java/lang/String"); + jobjectArray names = (*jniEnv)->NewObjectArray(jniEnv, safecast_size_t_to_jsize(numDim), stringClazz, NULL); + for (size_t i = 0; i < numDim; i++) { + jobject javaName = (*jniEnv)->NewStringUTF(jniEnv, dimensionNames[i]); + (*jniEnv)->SetObjectArrayElement(jniEnv, names, safecast_size_t_to_jsize(i), javaName); + } + free(dimensionNames); + // Create the TensorInfo object static const char *tensorInfoClassName = "ai/onnxruntime/TensorInfo"; jclass clazz = (*jniEnv)->FindClass(jniEnv, tensorInfoClassName); - jmethodID tensorInfoConstructor = (*jniEnv)->GetMethodID(jniEnv,clazz, "<init>", "([JI)V"); - //printf("TensorInfo class %p, methodID %p\n",clazz,tensorInfoConstructor); - jobject tensorInfo = (*jniEnv)->NewObject(jniEnv, clazz, tensorInfoConstructor, shape, onnxTypeInt); + jmethodID tensorInfoConstructor = (*jniEnv)->GetMethodID(jniEnv,clazz, "<init>", "([J[Ljava/lang/String;I)V"); + jobject tensorInfo = (*jniEnv)->NewObject(jniEnv, clazz, tensorInfoConstructor, shape, names, onnxTypeInt); return tensorInfo; }

diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java
index f6f9da1829402..7fef2dc784b7b 100644
--- a/java/src/test/java/ai/onnxruntime/InferenceTest.java
+++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java
@@ -590,6 +590,12 @@ public void testSymbolicDimensionAssignment() throws OrtException { Map<String, NodeInfo> infoMap = session.getInputInfo(); TensorInfo aInfo = (TensorInfo) infoMap.get("A").getInfo(); assertArrayEquals(new long[] {-1, 2}, aInfo.shape); + assertEquals(2, aInfo.dimensionNames.length); + assertEquals("n", aInfo.dimensionNames[0]); + assertEquals("", aInfo.dimensionNames[1]); + TensorInfo bInfo = (TensorInfo) infoMap.get("B").getInfo(); + assertEquals(1, bInfo.dimensionNames.length); + assertEquals("m", bInfo.dimensionNames[0]); } } // Check that when the options are assigned it overrides the symbolic dimension

From 1150b1f81ea7e46a840212acf194422af7f764a3 Mon Sep 17 00:00:00 2001
From: pengwa
Date: Tue, 16 Jan 2024 08:57:37 +0800
Subject: [PATCH 10/39] ORTModule memory improvement (#18924)

## Dependency
https://github.com/microsoft/onnxruntime/pull/19007

## ORTModule memory efficient gradient management
Previously I tried to solve the coarse-grained gradient accumulation/update problem in ORTModule with https://github.com/microsoft/onnxruntime/pull/8979, but that resolution was not fully validated with DDP or with user hooks registered on the gradient accumulation of torch parameters. This PR addresses the problem with a similar approach to PR 8979, i.e. triggering gradient accumulation once ORT has computed the grad, but instead of using an AccumulateGrad op, this time it is done with the ONNX operator PythonOp, which internally calls param.backward(grad) and therefore handles all related hooks correctly.

## Design
Check the details in https://microsoftapc-my.sharepoint.com/:p:/g/personal/pengwa_microsoft_com/EaaBq4EzsFhOmsDEXCG7Ba4Bb9bwd0O2sFV_JXJ4jBLYLA?e=7Sz2g8&nav=eyJzSWQiOjI3MSwiY0lkIjozMjE4NzI1NDIzfQ

## Convergence Validation:
![image](https://github.com/microsoft/onnxruntime/assets/10530022/ccf3a213-e815-4b23-b759-165033b2d9fe)
Differences are mostly on the order of 0.000x, sometimes 0.00x, which may come from gradient application happening in a different order before vs. after this change (on DeepSpeed ZeRO stage 2).

## TODO
Consolidate the logic with Stage3's similar logic.
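## Illustration
A minimal sketch of the accumulation idea above (illustrative only, not the actual ORTModule implementation; `accumulate_external_grad` is a made-up helper name): a gradient computed outside PyTorch autograd, e.g. by ORT, is handed to `param.backward(grad)`, which accumulates it into `param.grad` through the normal path, so hooks registered on the parameter still fire and the buffer holding the external gradient can be released immediately.

```python
# Minimal sketch, assuming `grad` was computed outside PyTorch autograd (e.g. by ORT).
# Feeding it through param.backward(grad) accumulates it into param.grad via the
# normal accumulation path, so hooks registered on the parameter still fire.
import torch


def accumulate_external_grad(param: torch.nn.Parameter, grad: torch.Tensor) -> None:
    # `param` is a leaf tensor, so backward(grad) simply accumulates `grad` into
    # param.grad; the buffer that produced `grad` can be freed right afterwards.
    param.backward(grad)


p = torch.nn.Parameter(torch.zeros(3))
p.register_hook(lambda g: print("hook saw grad:", g))

accumulate_external_grad(p, torch.ones(3))  # p.grad == [1., 1., 1.]
accumulate_external_grad(p, torch.ones(3))  # p.grad == [2., 2., 2.]
print(p.grad)
```

In ORTModule itself this call happens when the `PythonOpGrad` operator for a parameter runs, so the parameter's ORT-managed gradient buffer can be released as soon as its accumulation finishes, rather than only after the whole backward pass completes.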
--- docs/ORTModule_Training_Guidelines.md | 10 + onnxruntime/core/framework/execution_frame.cc | 3 +- .../python/tools/symbolic_shape_infer.py | 9 +- .../ortmodule/_graph_execution_manager.py | 109 ++++++-- .../ortmodule/_mem_efficient_grad_mgmt.py | 246 ++++++++++++++++++ .../python/training/ortmodule/_onnx_models.py | 1 + .../training/ortmodule/_pythonop_helper.py | 240 +++++++++++++++++ .../training/ortmodule/_training_manager.py | 27 +- .../python/training/ortmodule/options.py | 12 + .../utils/hooks/_zero_offload_subscriber.py | 2 +- .../python/orttraining_test_ortmodule_api.py | 2 +- .../torch_custom_function_kernel_base.cc | 5 +- ...-linux-nightly-ortmodule-test-pipeline.yml | 2 +- 13 files changed, 638 insertions(+), 30 deletions(-) create mode 100644 orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py create mode 100644 orttraining/orttraining/python/training/ortmodule/_pythonop_helper.py diff --git a/docs/ORTModule_Training_Guidelines.md b/docs/ORTModule_Training_Guidelines.md index bede16204d420..91057d3dfb120 100644 --- a/docs/ORTModule_Training_Guidelines.md +++ b/docs/ORTModule_Training_Guidelines.md @@ -293,6 +293,16 @@ A classical usage of disabling the deep copy: when the deep copy before module e export ORTMODULE_MEMORY_OPT_LEVEL=0 ``` +### ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT + +- **Feature Area**: *ORTMODULE/Optimizations* +- **Description**: By default, the memory-efficient gradient management is turned off. The gradient after it is computed in ONNX Runtime, will trigger the corresponding parameter's backward function through `PythonOpGrad` operator. This would help release the gradient buffer managed in ONNX Runtime, which originally is released once all backward computation finishes. + + ```bash + export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=1 # Enable + export ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=0 # Disable + ``` + ### 2.2 Memory Optimization Q: *Want to run a bigger batch size?* diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index d9c49dc6bea1d..8c08152986cf6 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -223,7 +223,8 @@ void IExecutionFrame::Init(gsl::span feed_mlvalue_idxs, gsl::span& initializers, const std::function& is_initializer_sparse_func, gsl::span fetches) { - ORT_ENFORCE(feeds.size() == feed_mlvalue_idxs.size()); + ORT_ENFORCE(feeds.size() == feed_mlvalue_idxs.size(), "Get feed size: ", feeds.size(), " but expected feed size: ", + feed_mlvalue_idxs.size()); ORT_ENFORCE(fetches.empty() || fetches.size() == fetch_mlvalue_idxs_.size()); // Need this for sparse conversions in host memory diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index e90eea553c185..ef4c4ae906243 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -2415,9 +2415,9 @@ def _infer_RotaryEmbedding(self, node): # noqa: N802 def _infer_PythonOp(self, node): # noqa: N802 output_tensor_types = get_attribute(node, "output_tensor_types") - assert output_tensor_types + assert output_tensor_types, f"PythonOp '{node.name}' has no output_tensor_types attribute." output_tensor_ranks = get_attribute(node, "output_tensor_ranks") - assert output_tensor_ranks + assert output_tensor_ranks, f"PythonOp '{node.name}' has no output_tensor_ranks attribute." 
from onnxruntime.capi._pybind_state import get_shape_inference_function @@ -2438,7 +2438,10 @@ def _infer_PythonOp(self, node): # noqa: N802 input_dtype = self.known_vi_[node.input[input_index]].type.tensor_type.elem_type input_dtypes.append(input_dtype) output_shapes, output_dtypes = shape_inferer(node, input_shapes, input_dtypes) - assert len(output_shapes) == len(output_dtypes) == (len(node.output) - 1) + assert len(output_shapes) == len(output_dtypes) == (len(node.output) - 1), ( + f"PythonOp '{func_name}' returned {len(output_shapes)} shapes and {len(output_dtypes)} dtypes, " + f"but expected {len(node.output) - 1} outputs." + ) for i in range(len(node.output) - 1): output_index = i + 1 vi = self.known_vi_[node.output[output_index]] diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 853eab61b4bd6..779b6bfe50422 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -36,7 +36,6 @@ from ._io import _FlattenedModule, _InputInfo from ._runtime_inspector import RuntimeInspector from ._utils import check_function_has_param, get_rank -from ._zero_stage3_compatibility import stage3_export_context from .options import DebugOptions, LogLevel, _MemoryOptimizationLevel, _RuntimeOptions from .torch_cpp_extensions.cpu.aten_op_executor import load_aten_op_executor_cpp_extension @@ -148,6 +147,10 @@ def __init__( configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="ort_output", stats_overwrite=True) + # Will be reset everytime we re-initialize the graph builder. + # Be noted, we will never enable this feature for inference mode. + self._mem_efficient_grad_management_is_enabled = False + def _get_torch_gpu_allocator_function_addresses(self): if self._runtime_options.use_external_gpu_allocator and torch.cuda.is_available(): # CPP extension to get torch GPU allocator's alloc and free function addresses @@ -388,6 +391,8 @@ def _get_exported_model(self, input_schema: ORTModelInputOutputSchemaType, *inpu assert self._export_mode is not None, "Please use a concrete instance of ExecutionManager" try: + from ._zero_stage3_compatibility import stage3_export_context + with torch.no_grad(), stage3_export_context(self._runtime_options.enable_zero_stage3_support, self): required_export_kwargs = { "input_names": self._input_info.names, @@ -496,9 +501,35 @@ def _get_graph_transformer_config(self) -> C.TrainingGraphTransformerConfigurati def _initialize_graph_builder(self): """Creates a new OrtModuleGraphBuilder, initializes it and saves it to self._graph_builder""" + self._mem_efficient_grad_management_is_enabled = ( + self._export_mode != torch.onnx.TrainingMode.EVAL + and self._runtime_options.enable_mem_efficient_grad_management + ) + + # We post process the exported model because the trainable parame might be changed, so this path is + # re-triggered by reinitialize_graph_builder. + exported_model = copy.deepcopy(self._onnx_models.exported_model) + self._onnx_models.processed_exported_model = exported_model + + if self._mem_efficient_grad_management_is_enabled: + from ._mem_efficient_grad_mgmt import post_processing_enable_mem_efficient_training + + # Override the options if model is not modified. 
+ ( + self._mem_efficient_grad_management_is_enabled, + exported_model, + ) = post_processing_enable_mem_efficient_training(exported_model, self._flattened_module.named_parameters()) + + if self._runtime_options.run_symbolic_shape_infer: + exported_model = SymbolicShapeInference.infer_shapes( + exported_model, auto_merge=True, guess_output_rank=True + ) + # All initializer names along with user inputs are a part of the onnx graph inputs # since the onnx model was exported with the flag keep_initializers_as_inputs=True - onnx_initializer_names = {p.name for p in self._onnx_models.exported_model.graph.input} + # We need to use the raw exported model here since the graph inputs include both user inputrs and + # parameters. + onnx_initializer_names = {p.name for p in exported_model.graph.input} # TODO: PyTorch exporter bug: changes the initializer order in ONNX model initializer_names = [ @@ -521,6 +552,13 @@ def _initialize_graph_builder(self): # Add stage3 pull weight trigger name to require_grad_names, so that it will be included in the gradient graph. input_names_require_grad.append(STAGE3_PULL_WEIGHT_TRIGGER_NAME) + + if self._mem_efficient_grad_management_is_enabled: + from ._mem_efficient_grad_mgmt import MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME + + # Add mem efficient grad trigger name to require_grad_names, so that it will be included in the gradient graph. + input_names_require_grad.append(MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME) + grad_builder_config.input_names_require_grad = input_names_require_grad grad_builder_config.build_gradient_graph = self._export_mode == torch.onnx.TrainingMode.TRAINING grad_builder_config.enable_caching = self._runtime_options.enable_grad_acc_optimization @@ -532,12 +570,23 @@ def _initialize_graph_builder(self): # It is assumed here that the order and names of the inputs and outputs are not modified by the backend in any way # and are kept as they appear in the exported onnx model. - self._graph_builder.initialize(self._onnx_models.exported_model.SerializeToString(), grad_builder_config) + self._graph_builder.initialize(exported_model.SerializeToString(), grad_builder_config) + + raw_onnx_initializer_names = {p.name for p in self._onnx_models.exported_model.graph.input} + + raw_initializer_names = [ + name for name, _ in self._flattened_module.named_parameters() if name in raw_onnx_initializer_names + ] + raw_initializer_names_to_train = [ + name + for name, param in self._flattened_module.named_parameters() + if param.requires_grad and name in raw_onnx_initializer_names + ] # TODO: Explore ways to make self._graph_info.initializer_names and self._graph_info.initializer_names_to_train # a set (unordered_set in the backend) that does not require a copy on each reference. - self._graph_initializer_names = set(initializer_names) - self._graph_initializer_names_to_train = set(initializer_names_to_train) + self._graph_initializer_names = set(raw_initializer_names) + self._graph_initializer_names_to_train = set(raw_initializer_names_to_train) # Initializers can be cached and used since they are expected not to be re-instantiated # between forward calls. @@ -588,7 +637,7 @@ def _enable_conditional_optimizations( # Enable data sparsity inspection if sparse optimizer is ON or user wants to print input density. 
if self._runtime_options.enable_sparse_optimizer or self._runtime_options.print_input_density: self._runtime_inspector.enable_input_inspector( - self._onnx_models.exported_model, self._graph_builder.get_graph_info().user_input_names + self._onnx_models.processed_exported_model, self._graph_builder.get_graph_info().user_input_names ) if self._runtime_options.enable_sparse_optimizer: @@ -596,11 +645,21 @@ def _enable_conditional_optimizations( inputs, kwargs ) - if self._runtime_options.enable_zero_stage3_support: + if self._runtime_options.enable_zero_stage3_support or self._mem_efficient_grad_management_is_enabled: self._append_pull_weight_trigger_as_input(kwargs, detected_device) + param_to_append_as_onnx_graph_inputs = [] + if self._mem_efficient_grad_management_is_enabled: + from ._mem_efficient_grad_mgmt import get_params_not_connected_to_pull_param_trigger + + param_to_append_as_onnx_graph_inputs = get_params_not_connected_to_pull_param_trigger( + self._flattened_module.named_parameters(), self._onnx_models.exported_model + ) + else: + param_to_append_as_onnx_graph_inputs = self._graph_initializers + _, embed_sparsity_results, label_sparsity_results = _io._combine_input_buffers_initializers( - self._graph_initializers, + param_to_append_as_onnx_graph_inputs, self._graph_builder.get_graph_info().user_input_names, self._input_info, self._flattened_module.named_buffers(), @@ -632,19 +691,31 @@ def _enable_conditional_optimizations( self._runtime_inspector.disable_input_inspector() def _append_pull_weight_trigger_as_input(self, kwargs: Dict, device: torch.device): - from ._zero_stage3_compatibility import ( - STAGE3_PULL_WEIGHT_TRIGGER_NAME, - STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, - STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, - ) + if self._runtime_options.enable_zero_stage3_support: + from ._zero_stage3_compatibility import ( + STAGE3_PULL_WEIGHT_TRIGGER_NAME, + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE, + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, + ) - kwargs[STAGE3_PULL_WEIGHT_TRIGGER_NAME] = torch.zeros( - STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, - dtype=onnx_dtype_to_pytorch_dtype(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE), - device=device, - ).requires_grad_() + kwargs[STAGE3_PULL_WEIGHT_TRIGGER_NAME] = torch.zeros( + STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_SHAPE, + dtype=onnx_dtype_to_pytorch_dtype(STAGE3_PULL_WEIGHT_TRIGGER_OUTPUT_DTYPE), + device=device, + ).requires_grad_() + + if self._mem_efficient_grad_management_is_enabled: + from ._mem_efficient_grad_mgmt import ( + MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME, + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_DTYPE, + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_SHAPE, + ) - return kwargs + kwargs[MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME] = torch.zeros( + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_SHAPE, + dtype=onnx_dtype_to_pytorch_dtype(MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_DTYPE), + device=device, + ).requires_grad_() def _log_feature_stats(self): if get_rank() != 0: diff --git a/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py b/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py new file mode 100644 index 0000000000000..4663afdaa94a0 --- /dev/null +++ b/orttraining/orttraining/python/training/ortmodule/_mem_efficient_grad_mgmt.py @@ -0,0 +1,246 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from __future__ import annotations + +import ctypes + +import torch +from onnx import ModelProto, NodeProto, TensorProto, helper + +from onnxruntime.training.utils import pytorch_type_to_onnx_dtype + +from ._pythonop_helper import make_pythonop_node + +MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME = "mem_efficient_pull_weight_trigger" +MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_DTYPE = TensorProto.FLOAT +MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_SHAPE = [1] + + +def get_params_connected_to_pull_param_trigger( + named_params: dict[str, torch.nn.parameter.Parameter], exported_model: ModelProto +): + # Be noted, some parameters might not in graph input because they are not used in forward, so we filtered them also. + onnx_initializer_names = {p.name for p in exported_model.graph.input} + return {k: v for k, v in named_params if v.requires_grad and k in onnx_initializer_names} + + +def get_params_not_connected_to_pull_param_trigger( + named_params: dict[str, torch.nn.parameter.Parameter], exported_model: ModelProto +): + # Be noted, some parameters might not in graph input because they are not used in forward, so we filtered them also. + onnx_initializer_names = {p.name for p in exported_model.graph.input} + return [v for k, v in named_params if not v.requires_grad and k in onnx_initializer_names] + + +def post_processing_enable_mem_efficient_training( + exported_model: ModelProto, + named_params: dict[str, torch.nn.parameter.Parameter], +) -> tuple[bool, ModelProto]: + """This function is used to enable zero stage3 compatibility. + + Args: + exported_model (ModelProto): The exported model. + named_params (Optional[Dict[str, torch.nn.parameter.Parameter]]): The full parameter map. + + Returns: + tuple[bool, ModelProto]: A tuple of bool and ModelProto. The bool indicates whether the model is modified. + + """ + trainable_named_params = get_params_connected_to_pull_param_trigger(named_params, exported_model) + if len(trainable_named_params) == 0: + return False, exported_model + + # Create weight retrieving function using trainable_named_params. + param_pull_trigger_func_class = _create_param_trigger_function(trainable_named_params) + param_retrieve_func_class = _create_param_retrieval_function(trainable_named_params) + + def _get_param_pull_trigger_name(param_name: str) -> str: + return f"pull_{param_name}" + + # Create weight retrieving PythonOp. + inputs = [ + helper.make_tensor_value_info( + MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME, + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_DTYPE, # Use the same data type with output for the input + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_SHAPE, + ) + ] + + outputs = [ + helper.make_tensor_value_info( + _get_param_pull_trigger_name(pname), + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_DTYPE, + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_SHAPE, + ) + for pname in trainable_named_params + ] + + weight_pull_node = make_pythonop_node( + "weight_pull_trigger", + inputs, + outputs, + param_pull_trigger_func_class, + training_mode=1, + safe_run_mode=0, + ) + + graph_inputs_to_remove = [] + input_offset = 0 + for graph_input in exported_model.graph.input: + if graph_input.name not in trainable_named_params: + continue + + graph_inputs_to_remove.append(graph_input) + + # Create the param retrieval function for this parameter. 
+ node_inputs = [ + helper.make_tensor_value_info( + _get_param_pull_trigger_name(graph_input.name), + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_DTYPE, + MEM_EFFICIENT_PARAM_TRIGGER_OUTPUT_SHAPE, + ), + graph_input.name, # Second param is a string, which represents the param_name + ] + + node_outputs = [ + helper.make_tensor_value_info( + graph_input.name, # output use the same name as weight + int(pytorch_type_to_onnx_dtype(trainable_named_params[graph_input.name].dtype)), + list(trainable_named_params[graph_input.name].shape), + ), + ] + + new_node = make_pythonop_node( + f"weight_retrieval_{graph_input.name}", + node_inputs, + node_outputs, + param_retrieve_func_class, + training_mode=1, + safe_run_mode=0, + ) + exported_model.graph.node.insert(input_offset, new_node) + input_offset += 1 + + # Delete exported_model.graph.input + names_to_remove = [input.name for input in graph_inputs_to_remove] + value_infos_to_remove = [ + value_info for value_info in exported_model.graph.value_info if value_info.name in names_to_remove + ] + for value_info in value_infos_to_remove: + exported_model.graph.value_info.remove(value_info) + + for input_to_remove in graph_inputs_to_remove: + exported_model.graph.input.remove(input_to_remove) + + # Re-order graph input to make sure the weight pull trigger is the first user input. + offset = 0 # Find the first trainable param, and insert the new input before it, as part of user inputs. + for input in exported_model.graph.input: + if input.name in named_params: + break + offset += 1 + exported_model.graph.input.insert(offset, inputs[0]) + exported_model.graph.node.insert(0, weight_pull_node) + + return True, exported_model + + +_PARAM_FUNCTION_INDEX = [0] + + +def _create_param_trigger_function(trainable_named_params: dict[str, torch.nn.parameter.Parameter]): + """This function is used to create a weight retrieving function using trainable_named_params.""" + + @staticmethod + def forward(ctx, weight_in_trigger): + params = list(trainable_named_params.values()) + ctx.params = params + ctx.dtype = weight_in_trigger.dtype + ctx.device = weight_in_trigger.device + ctx.shape = weight_in_trigger.shape + return (torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype),) * len(params) + + @staticmethod + def backward(ctx, *grad_outputs): + return torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype) + + @staticmethod + def infer_shape( + node: NodeProto, + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: + param_count = len(trainable_named_params.values()) + tensor_output_shapes = [ + tensor_input_shapes[0], + ] * param_count + tensor_output_dtypes = [ + tensor_input_dtypes[0], + ] * param_count + + return tensor_output_shapes, tensor_output_dtypes + + _PARAM_FUNCTION_INDEX[0] += 1 + + return type( + f"ParamTriggerFunction_{_PARAM_FUNCTION_INDEX[0]}", + (torch.autograd.Function,), + { + "forward": forward, + "backward": backward, + "infer_shape": infer_shape, + }, + ) + + +def _create_param_retrieval_function(trainable_named_params: dict[str, torch.nn.parameter.Parameter]): + """This function is used to create a weight retrieving function using trainable_named_params.""" + + @staticmethod + def forward(ctx, param_trigger, param_name): + ctx.param_name = param_name + ctx.dtype = param_trigger.dtype + ctx.device = param_trigger.device + ctx.shape = param_trigger.shape + return trainable_named_params[param_name] + + 
@staticmethod + def backward(ctx, *grad_outputs): + trainable_named_params[ctx.param_name].backward(grad_outputs[0]) + return torch.zeros(ctx.shape, device=ctx.device, dtype=ctx.dtype), None + + @staticmethod + def infer_shape( + node: NodeProto, + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: + input_pointer_scalars_attr_name = "input_pointer_scalars" + found = [attr for attr in node.attribute if attr.name == input_pointer_scalars_attr_name] + + assert len(found) == 1 + input_pointer_scalars = found[0].ints + + # Restore the nn.Module from the pointer. + param_name = ctypes.cast(input_pointer_scalars[0], ctypes.py_object).value + + tensor_output_shapes = [ + list(trainable_named_params[param_name].shape), + ] + tensor_output_dtypes = [ + int(pytorch_type_to_onnx_dtype(trainable_named_params[param_name].dtype)), + ] + + return tensor_output_shapes, tensor_output_dtypes + + return type( + f"ParamRetrievalFunction_{_PARAM_FUNCTION_INDEX[0]}", + (torch.autograd.Function,), + { + "forward": forward, + "backward": backward, + "infer_shape": infer_shape, + }, + ) diff --git a/orttraining/orttraining/python/training/ortmodule/_onnx_models.py b/orttraining/orttraining/python/training/ortmodule/_onnx_models.py index d687bc24384ed..a0001a2f201f1 100644 --- a/orttraining/orttraining/python/training/ortmodule/_onnx_models.py +++ b/orttraining/orttraining/python/training/ortmodule/_onnx_models.py @@ -33,6 +33,7 @@ class ONNXModels: """ exported_model: Optional[onnx.ModelProto] = None + processed_exported_model: Optional[onnx.ModelProto] = None optimized_model: Optional[onnx.ModelProto] = None def save_exported_model(self, path, name_prefix, export_mode): diff --git a/orttraining/orttraining/python/training/ortmodule/_pythonop_helper.py b/orttraining/orttraining/python/training/ortmodule/_pythonop_helper.py new file mode 100644 index 0000000000000..32a564b27acd0 --- /dev/null +++ b/orttraining/orttraining/python/training/ortmodule/_pythonop_helper.py @@ -0,0 +1,240 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from __future__ import annotations + +import inspect + +import onnx +import torch + +from onnxruntime.capi._pybind_state import register_miscellaneous_const_input, register_torch_autograd_function + +from ._custom_autograd_function_exporter import register_custom_function_schema_supplementary +from ._utils import get_fully_qualified_class_name + +PYTHON_OP_DOMAIN = "com.microsoft" +PYTHON_OP_TYPE = "PythonOp" + +PYTHON_OP_ATTRIBUTE_FUNC_NAME = "func_name" +PYTHON_OP_ATTRIBUTE_SAFE_RUN_MODE = "safe_run_mode" +PYTHON_OP_ATTRIBUTE_TRAINING_MODE = "training_mode" + + +def set_safe_run_mode(model: onnx.ModelProto, allowed_unsafe_run_python_op_names: list[str]) -> onnx.ModelProto: + # Update safe_run_mode attribute for PythonOp. 
+ for node in model.graph.node: + if node.domain == PYTHON_OP_DOMAIN and node.op_type == PYTHON_OP_TYPE: + func_name = None + safe_run_mode_attr = None + for attr in node.attribute: + if attr.name == PYTHON_OP_ATTRIBUTE_FUNC_NAME: + func_name = attr.s.decode("utf-8") if isinstance(attr.s, bytes) else attr.s + if attr.name == PYTHON_OP_ATTRIBUTE_SAFE_RUN_MODE: + safe_run_mode_attr = attr + + if func_name in allowed_unsafe_run_python_op_names: + if safe_run_mode_attr: + node.attribute.remove(safe_run_mode_attr) + node.attribute.append(onnx.helper.make_attribute(PYTHON_OP_ATTRIBUTE_SAFE_RUN_MODE, 0)) + + return model + + +_PYTHON_OP_INCRE_INDEX = [0] + + +def make_pythonop_node( + name_prefix: str, + inputs: list[ + onnx.ValueInfoProto | int | bool | float | tuple[int, ...] | tuple[bool, ...] | tuple[float, ...] | object + ], + outputs: list[onnx.ValueInfoProto], + func_class: torch.autograd.Function, + training_mode: int, + safe_run_mode: int, +) -> onnx.NodeProto: + assert issubclass(func_class, torch.autograd.Function), "func_class must be a subclass of torch.autograd.Function." + + assert len(inputs) > 0, f"inputs must not be empty for function {func_class}." + assert len(outputs) > 0, f"outputs must not be empty for function {func_class}." + + all_input_parameters: list[inspect.Parameter] = list(inspect.signature(func_class.forward).parameters.values()) + + # Remove the first parameter (ctx) from inspected parameter list. + assert len(inputs) == len(all_input_parameters) - 1, ( + f"The number of inputs ({len(inputs)}) must match the number of parameters " + f"({len(all_input_parameters) - 1}) of the forward function." + ) + + func_full_qual_name = get_fully_qualified_class_name(func_class) + + input_tensor_types = [] + input_tensor_ranks = [] + + input_bool_scalars = [] + input_bool_scalar_positions = [] + + input_int_scalars = [] + input_int_scalar_positions = [] + + input_float_scalars = [] + input_float_scalar_positions = [] + + input_bool_tuples = [] + input_bool_tuple_positions = [] + input_bool_tuple_begins = [] + + input_int_tuples = [] + input_int_tuple_positions = [] + input_int_tuple_begins = [] + + input_float_tuples = [] + input_float_tuple_positions = [] + input_float_tuple_begins = [] + + input_pointer_scalars = [] + input_pointer_scalar_positions = [] + + tensor_args = [] + debug_comment = "" + cconv = "" + # Encode inputs to torch.autograd.Function. + for i, arg in enumerate(inputs): + if isinstance(arg, onnx.ValueInfoProto): + # Got a tensor variable. + tensor_args.append(arg.name) + input_tensor_types.append(arg.type.tensor_type.elem_type) + input_tensor_ranks.append(len(arg.type.tensor_type.shape.dim)) + cconv += "d" + continue + + cconv += "c" + + # Got a non-tensor variable. + if isinstance(arg, float): + # A float. + input_float_scalar_positions.append(i) + input_float_scalars.append(arg) + continue + # bool check MUST be before int check since bool is a subclass of int + elif isinstance(arg, bool): + # A bool. + input_bool_scalar_positions.append(i) + input_bool_scalars.append(int(arg)) + continue + elif isinstance(arg, int): + # A int. + input_int_scalar_positions.append(i) + input_int_scalars.append(arg) + continue + + is_bool_tuple = False + is_int_tuple = False + is_float_tuple = False + if isinstance(arg, tuple) and len(arg) > 0: + # bool check MUST be before int check since bool is a subclass of int. 
+ is_bool_tuple = all(isinstance(ele, bool) for ele in arg) + is_int_tuple = not is_bool_tuple and all(isinstance(ele, int) for ele in arg) + is_float_tuple = not is_bool_tuple and not is_int_tuple and all(isinstance(ele, float) for ele in arg) + + # Only support tuple of bool, int or float, for other types, handle it as a pointer. + if is_bool_tuple: + # A tuple of bool. + input_bool_tuple_positions.append(i) + input_bool_tuple_begins.append(len(input_bool_tuples)) + input_bool_tuples.extend([int(ele) for ele in arg]) + continue + elif is_int_tuple: + # A tuple of ints. + input_int_tuple_positions.append(i) + input_int_tuple_begins.append(len(input_int_tuples)) + input_int_tuples.extend(list(arg)) + continue + elif is_float_tuple: + # A tuple of floats. + input_float_tuple_positions.append(i) + input_float_tuple_begins.append(len(input_float_tuples)) + input_float_tuples.extend(list(arg)) + continue + + from onnxruntime.training.utils.hooks._statistics_subscriber import _InspectActivation + + is_inspect_activation = func_full_qual_name == get_fully_qualified_class_name(_InspectActivation) + if is_inspect_activation and isinstance(arg, str): + # _InspectActivation is a special case where the first argument is a string + # that is used to determine the activation name to be inspected. + debug_comment += arg + + # All other inputs are accessed via "pointers". + input_pointer_scalar_positions.append(i) + input_pointer_scalars.append(id(arg)) + + # For pointer (for example, ProcessGroup passed to PythonOp) needed for PythonOp execution, + # we append it into a global store to hold a reference (in case it is released after module exported). + register_miscellaneous_const_input(arg) + + output_tensor_types = [] + output_tensor_ranks = [] + for arg in outputs: + output_tensor_types.append(arg.type.tensor_type.elem_type) + output_tensor_ranks.append(len(arg.type.tensor_type.shape.dim)) + + attrs = { + "func_name": func_full_qual_name, + "input_convention": cconv, + "input_tensor_types": input_tensor_types, + "input_tensor_ranks": input_tensor_ranks, + "output_tensor_types": output_tensor_types, + "output_tensor_ranks": output_tensor_ranks, + "training_mode": training_mode, + "safe_run_mode": safe_run_mode, + "comment": debug_comment, + } + + if len(input_bool_scalars) > 0: + attrs["input_bool_scalars"] = input_bool_scalars + attrs["input_bool_scalar_positions"] = input_bool_scalar_positions + if len(input_int_scalars) > 0: + attrs["input_int_scalars"] = input_int_scalars + attrs["input_int_scalar_positions"] = input_int_scalar_positions + if len(input_float_scalars) > 0: + attrs["input_float_scalars"] = input_float_scalars + attrs["input_float_scalar_positions"] = input_float_scalar_positions + if len(input_bool_tuples) > 0: + attrs["input_bool_tuples"] = input_bool_tuples + attrs["input_bool_tuple_positions"] = input_bool_tuple_positions + attrs["input_bool_tuple_begins"] = input_bool_tuple_begins + if len(input_int_tuples) > 0: + attrs["input_int_tuples"] = input_int_tuples + attrs["input_int_tuple_positions"] = input_int_tuple_positions + attrs["input_int_tuple_begins"] = input_int_tuple_begins + if len(input_float_tuples) > 0: + attrs["input_float_tuples"] = input_float_tuples + attrs["input_float_tuple_positions"] = input_float_tuple_positions + attrs["input_float_tuple_begins"] = input_float_tuple_begins + if len(input_pointer_scalars) > 0: + attrs["input_pointer_scalars"] = input_pointer_scalars + attrs["input_pointer_scalar_positions"] = input_pointer_scalar_positions + + # Register 
function with class names. + register_torch_autograd_function(func_full_qual_name, func_class) + + register_custom_function_schema_supplementary(func_class) + + _PYTHON_OP_INCRE_INDEX[0] += 1 + node_name = f"{name_prefix}_{_PYTHON_OP_INCRE_INDEX[0]}" + + node = onnx.helper.make_node( + PYTHON_OP_TYPE, + tensor_args, + [f"{node_name}_ctx", *[output.name for output in outputs]], + node_name, # node name + "", + PYTHON_OP_DOMAIN, + **attrs, + ) + + return node diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index 5b2c673ce94cb..cc533e549db92 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -310,11 +310,22 @@ def forward(self, *inputs, **kwargs): self._gradient_accumulation_manager.maybe_update_cache_before_run() - if self._runtime_options.enable_zero_stage3_support: + if self._runtime_options.enable_zero_stage3_support or self._mem_efficient_grad_management_is_enabled: self._append_pull_weight_trigger_as_input(kwargs, self._device) + param_to_append_as_onnx_graph_inputs = [] + if self._mem_efficient_grad_management_is_enabled: + from ._mem_efficient_grad_mgmt import get_params_not_connected_to_pull_param_trigger + + param_to_append_as_onnx_graph_inputs = get_params_not_connected_to_pull_param_trigger( + self._flattened_module.named_parameters(), self._onnx_models.exported_model + ) + + else: + param_to_append_as_onnx_graph_inputs = self._graph_initializers + prepared_input_list, _, _ = _io._combine_input_buffers_initializers( - self._graph_initializers, + param_to_append_as_onnx_graph_inputs, self._graph_info.user_input_names, self._input_info, self._flattened_module.named_buffers(), @@ -492,10 +503,20 @@ def _reinitialize_graph_builder(self, input_info: _InputInfo): if param.requires_grad and name in self._graph_initializer_names } + if self._mem_efficient_grad_management_is_enabled: + from ._mem_efficient_grad_mgmt import MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME + + # Remove the inputs we added during model post-processing. + existing_require_grad_names = [ + n for n in self._input_info.require_grad_names if n != MEM_EFFICIENT_PARAM_TRIGGER_INPUT_NAME + ] + else: + existing_require_grad_names = self._input_info.require_grad_names + # If inputs requiring gradient change from forward to the next, the module_gradient_graph_builder # needs to be reinitialized so it can compute the backward output for the new inputs that require_grad if ( - input_info.require_grad_names != self._input_info.require_grad_names + input_info.require_grad_names != existing_require_grad_names or initializer_names_to_train_set_user_model != self._graph_initializer_names_to_train ): self._input_info = input_info diff --git a/orttraining/orttraining/python/training/ortmodule/options.py b/orttraining/orttraining/python/training/ortmodule/options.py index bfa38efb349ae..df3b078788d16 100644 --- a/orttraining/orttraining/python/training/ortmodule/options.py +++ b/orttraining/orttraining/python/training/ortmodule/options.py @@ -308,6 +308,9 @@ def __init__(self, logger: Logger): # Experimental features. self.enable_zero_stage3_support = False # Once enabled, cannot be disabled. + # We disable memory efficient grad management by default, will enable once it's fully validated. 
+ self.enable_mem_efficient_grad_management = False + self.deepcopy_before_model_export = True # Override the feature config if it exists in os env. @@ -397,6 +400,15 @@ def _override_from_env_vars(self): if "ORTMODULE_ENABLE_ZERO_STAGE3" in os.environ and int(os.getenv("ORTMODULE_ENABLE_ZERO_STAGE3")) == 1: self.enable_zero_stage3_support = True + if "ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT" in os.environ: + enable_grad_mgmt = int(os.getenv("ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT")) + self.enable_mem_efficient_grad_management = enable_grad_mgmt == 1 and self.enable_custom_autograd_function + if not self.enable_custom_autograd_function and enable_grad_mgmt == 1: + self._logger.warning( + "ORTModule optimization for memory efficient gradient management cannot be enabled " + "because PyTorch custom autograd function support is disabled." + ) + if "ORTMODULE_DEEPCOPY_BEFORE_MODEL_EXPORT" in os.environ: self.deepcopy_before_model_export = int(os.getenv("ORTMODULE_DEEPCOPY_BEFORE_MODEL_EXPORT")) == 1 diff --git a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py index e6004319ef5ea..d4b9768116e92 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py +++ b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py @@ -289,7 +289,7 @@ def backward(ctx, *grads): raise RuntimeError(f"param {p} has no grad, this should not happen.") # Param gradient accumulation is triggered here, along with the attached hooks, done by PyTorch. assert p.shape == g.shape, f"param_index: {param_index} - param shape {p.shape} != grad shape {g.shape}" - # p.backward(g) + p.backward(g) # At this point, the **real** param grads are already updated, the following grads are only used for # completing the full backward propagation, will not affect parameter updates. 
diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index f944d8bc5ef42..938d33cc9a714 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -684,7 +684,7 @@ def test_input_requires_grad_saved(device): model = ORTModule(model) x = torch.randn(N, D_in, device=device, requires_grad=True) + 1 model(x) - assert model._torch_module._execution_manager(model._is_training())._input_info.require_grad_names == ["input1"] + assert "input1" in model._torch_module._execution_manager(model._is_training())._input_info.require_grad_names @pytest.mark.parametrize("device", ["cuda", "cpu"]) diff --git a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc index 3c5ac56cb139a..0a98cd959dd36 100644 --- a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc +++ b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel_base.cc @@ -385,7 +385,10 @@ void PythonOpGradBase::RunBackward(OpKernelContext* context, void PythonOpGradBase::SetOutputs(OpKernelContext* context, std::vector& returned_ortvalues) const { auto* ctx_internal = reinterpret_cast(context); - ORT_ENFORCE(output_convention_.size() == returned_ortvalues.size(), "backward output count mismatch."); + ORT_ENFORCE(output_convention_.size() == returned_ortvalues.size(), "backward output count mismatch. Expected ", + output_convention_.size(), ", but got ", returned_ortvalues.size(), + ". Please check the backward function return same number of outputs as forward function's input for ", + name_, "."); int tensor_output_index = 0; for (size_t i = 0; i < returned_ortvalues.size(); ++i) { if (output_convention_[i] == 'd') { diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml index 7824bf2203efe..e13ef9160bed3 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-nightly-ortmodule-test-pipeline.yml @@ -24,7 +24,7 @@ jobs: --volume $(Build.SourcesDirectory)/orttraining/orttraining/test/python:/onnxruntime_src \ --volume $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly:/requirements_torch_nightly \ ptebic.azurecr.io/internal/aifx/acpt/nightly-ubuntu-cuda-torch-dev \ - bash -c "python3 -m pip install -r /requirements_torch_nightly/requirements.txt && python3 -m pytest -sv /onnxruntime_src/orttraining_test_ortmodule_api.py" + bash -c "python3 -m pip install -r /requirements_torch_nightly/requirements.txt && ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=0 python3 -m pytest -sv /onnxruntime_src/orttraining_test_ortmodule_api.py && ORTMODULE_ENABLE_MEM_EFFICIENT_GRAD_MGMT=1 python3 -m pytest -sv /onnxruntime_src/orttraining_test_ortmodule_api.py" displayName: 'Run ORTModule Tests' condition: succeededOrFailed() timeoutInMinutes: 120 From 9f87c5c41d50fdcf30ce439617c708c964d8a050 Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield <38966965+jeffbloo@users.noreply.github.com> Date: Mon, 15 Jan 2024 17:10:58 -0800 Subject: [PATCH 11/39] Fix build error due to merge with DML adapter 
enumeration macro defined (#19121) ### Description Fix build error when ENABLE_NPU_ADAPTER_ENUMERATION is defined ### Motivation and Context --- onnxruntime/core/providers/dml/dml_provider_factory.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc index 73a068f3e1de2..b2688094a6d78 100644 --- a/onnxruntime/core/providers/dml/dml_provider_factory.cc +++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc @@ -329,7 +329,6 @@ static std::optional ParseFilter(const ProviderOptions& prov static const std::string Any = "any"; static const std::string Gpu = "gpu"; #ifdef ENABLE_NPU_ADAPTER_ENUMERATION - static const std::string Any = "any"; static const std::string Npu = "npu"; #endif From 9dee543bedaed8419957afaed3a64b1ab5fa3a21 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 15 Jan 2024 18:40:38 -0800 Subject: [PATCH 12/39] fix gemm beta for fp16 (#19153) per onnx spec beta is always fp32 so we need to cast it --- js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts index 30754c84413b7..a0d4021516bf7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts @@ -100,8 +100,8 @@ const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAt ${calculateAlpha} ${(() => { if (c != null) { - return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += uniforms.beta * ${ - c.getByOffset('cOffset')};`; + return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += ${ + dataType}(uniforms.beta) * ${c.getByOffset('cOffset')};`; } return ''; })()} From 1bab98988b4e7b6d33be0e672fce361ccbb1d397 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Tue, 16 Jan 2024 10:44:25 +0800 Subject: [PATCH 13/39] [WebNN EP] Fixed bug in int8 data type processing (#19134) --- .../core/providers/webnn/builders/helper.cc | 5 ++++- .../core/providers/webnn/builders/helper.h | 4 +++- .../webnn/builders/impl/cast_op_builder.cc | 4 +++- .../webnn/builders/impl/conv_op_builder.cc | 4 +++- .../core/providers/webnn/builders/model.cc | 18 ++++++++++++++---- .../providers/webnn/builders/model_builder.cc | 11 +++++++++-- 6 files changed, 36 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.cc b/onnxruntime/core/providers/webnn/builders/helper.cc index a55145b0125a7..ef7c10dae580c 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.cc +++ b/onnxruntime/core/providers/webnn/builders/helper.cc @@ -166,11 +166,14 @@ bool SetWebnnDataType(emscripten::val& desc, const int32_t data_type) { // TODO: Remove legacy "type" once all browsers implement the new "dataType". 
switch (data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: desc.set("type", emscripten::val("uint8")); desc.set("dataType", emscripten::val("uint8")); return true; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + desc.set("type", emscripten::val("int8")); + desc.set("dataType", emscripten::val("int8")); + return true; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: desc.set("type", emscripten::val("float16")); desc.set("dataType", emscripten::val("float16")); diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index f3fc7ec5cc4cd..85dafcaf66575 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -101,10 +101,12 @@ inline bool ReadScalarTensorData(const onnx::TensorProto& tensor, emscripten::va } switch (tensor.data_type()) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: scalar = emscripten::val{MLFloat16::FromBits(*reinterpret_cast(unpacked_tensor.data())).ToFloat()}; break; diff --git a/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc index 062f1c56061a9..3d961e4589c2e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc @@ -39,10 +39,12 @@ Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::string operand_type; switch (to_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: operand_type = "uint8"; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + operand_type = "int8"; + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: operand_type = "float16"; break; diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 123a9cc016515..ceacb7c2b38a3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -184,10 +184,12 @@ Status AddInitializerInNewLayout(ModelBuilder& model_builder, size_t element_size{0}; switch (data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: element_size = sizeof(uint8_t); break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + element_size = sizeof(int8_t); + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: element_size = sizeof(uint16_t); break; diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc index a4031fd9350c5..eaf549ef4e072 100644 --- a/onnxruntime/core/providers/webnn/builders/model.cc +++ b/onnxruntime/core/providers/webnn/builders/model.cc @@ -33,11 +33,14 @@ Status Model::Predict(const InlinedHashMap& inputs, emscripten::val view = emscripten::val::undefined(); switch 
(tensor.tensor_info.data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: view = emscripten::val{emscripten::typed_memory_view(num_elements, static_cast(tensor.buffer))}; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + view = emscripten::val{emscripten::typed_memory_view(num_elements, + static_cast(tensor.buffer))}; + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: view = emscripten::val{emscripten::typed_memory_view(num_elements, static_cast(tensor.buffer))}; @@ -90,11 +93,14 @@ Status Model::Predict(const InlinedHashMap& inputs, emscripten::val view = emscripten::val::undefined(); switch (tensor.tensor_info.data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: view = emscripten::val{emscripten::typed_memory_view(num_elements, static_cast(tensor.buffer))}; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + view = emscripten::val{emscripten::typed_memory_view(num_elements, + static_cast(tensor.buffer))}; + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: view = emscripten::val{emscripten::typed_memory_view(num_elements, static_cast(tensor.buffer))}; @@ -168,10 +174,12 @@ void Model::AllocateInputOutputBuffers() { const auto data_type = input_info.data_type; switch (data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: wnn_inputs_.set(input, emscripten::val::global("Uint8Array").new_(num_elements)); break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + wnn_inputs_.set(input, emscripten::val::global("Int8Array").new_(num_elements)); + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: wnn_inputs_.set(input, emscripten::val::global("Uint16Array").new_(num_elements)); break; @@ -201,10 +209,12 @@ void Model::AllocateInputOutputBuffers() { const auto data_type = output_info.data_type; switch (data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: wnn_outputs_.set(output, emscripten::val::global("Uint8Array").new_(num_elements)); break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + wnn_outputs_.set(output, emscripten::val::global("Int8Array").new_(num_elements)); + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: wnn_outputs_.set(output, emscripten::val::global("Uint16Array").new_(num_elements)); break; diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index 4e0c83db8b127..cf8a0e23db43b 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -160,12 +160,16 @@ Status ModelBuilder::RegisterInitializers() { } switch (data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: desc.set("type", emscripten::val("uint8")); view = emscripten::val{emscripten::typed_memory_view(num_elements, reinterpret_cast(tensor_ptr))}; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + desc.set("type", emscripten::val("int8")); + view = emscripten::val{emscripten::typed_memory_view(num_elements, + reinterpret_cast(tensor_ptr))}; + break; case 
ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: view = emscripten::val{emscripten::typed_memory_view(num_elements, reinterpret_cast(tensor_ptr))}; @@ -318,11 +322,14 @@ Status ModelBuilder::AddOperandFromPersistMemoryBuffer( ORT_RETURN_IF_NOT(SetWebnnDataType(desc, data_type), "Unsupported data type"); switch (data_type) { case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - case ONNX_NAMESPACE::TensorProto_DataType_INT8: case ONNX_NAMESPACE::TensorProto_DataType_UINT8: view = emscripten::val{emscripten::typed_memory_view(size / sizeof(uint8_t), reinterpret_cast(dest))}; break; + case ONNX_NAMESPACE::TensorProto_DataType_INT8: + view = emscripten::val{emscripten::typed_memory_view(size / sizeof(int8_t), + reinterpret_cast(dest))}; + break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: view = emscripten::val{emscripten::typed_memory_view(size / sizeof(uint16_t), reinterpret_cast(dest))}; From 8d4369b77ef8567653db3e247bbb2f48889fc457 Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield <38966965+jeffbloo@users.noreply.github.com> Date: Mon, 15 Jan 2024 19:04:41 -0800 Subject: [PATCH 14/39] Update DirectML nuget version to 1.13.1 (#19122) ### Description Update DML version to 1.13.1 ### Motivation and Context --- .pipelines/nuget_config/x64/packages.config | 2 +- .pipelines/nuget_config/x86/packages.config | 2 +- cmake/external/dml.cmake | 2 +- packages.config | 2 +- tools/nuget/generate_nuspec_for_native_nuget.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config index 2583e0d1b2ead..b862dec5e1c87 100644 --- a/.pipelines/nuget_config/x64/packages.config +++ b/.pipelines/nuget_config/x64/packages.config @@ -1,6 +1,6 @@  - + diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config index 5ca659941c159..c348dd3e9cdad 100644 --- a/.pipelines/nuget_config/x86/packages.config +++ b/.pipelines/nuget_config/x86/packages.config @@ -1,6 +1,6 @@  - + diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake index dfd9ad120eb98..ae7e6d3801a64 100644 --- a/cmake/external/dml.cmake +++ b/cmake/external/dml.cmake @@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML) set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config) set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE) - set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.13.0) + set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.13.1) # Restore nuget packages, which will pull down the DirectML redist package. 
add_custom_command( diff --git a/packages.config b/packages.config index b67219d6d6913..e5b134d99dd89 100644 --- a/packages.config +++ b/packages.config @@ -1,6 +1,6 @@  - + diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 56e50750ac153..09fe99d36cc34 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -219,7 +219,7 @@ def add_common_dependencies(xml_text, package_name, version): def generate_dependencies(xml_text, package_name, version): - dml_dependency = '' + dml_dependency = '' if package_name == "Microsoft.AI.MachineLearning": xml_text.append("") From c92f72ebebf5f4a1e63b726e6e5cec1a47250bb5 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 16 Jan 2024 11:59:03 -0500 Subject: [PATCH 15/39] Merge Linux Nuget GPU pipeline with zip-nuget (#19120) ### Description ### Motivation and Context --- .../c-api-noopenmp-packaging-pipelines.yml | 174 ++---------------- .../nuget-linux-cuda-packaging-stage.yml | 18 +- 2 files changed, 31 insertions(+), 161 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index f80b035582f18..2169a3ce1bb9e 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -83,6 +83,16 @@ resources: variables: - name: ReleaseVersionSuffix value: '' +- name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 +- name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 stages: - stage: Setup @@ -189,64 +199,11 @@ stages: AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' BuildVariant: 'default' -- stage: Linux_C_API_Packaging_GPU_x64 - dependsOn: [] - jobs: - - job: - workspace: - clean: all - timeoutInMinutes: 120 - pool: 'Onnxruntime-Linux-GPU' - variables: - - name: CUDA_VERSION_MAJOR - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: '11' - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: '12' - - name: CUDA_VERSION - value: ${{ parameters.CudaVersion }} - steps: - - template: templates/set-version-number-variables-step.yml - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecuda$(CUDA_VERSION_MAJOR)build - - - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh - workingDirectory: $(Build.SourcesDirectory) - displayName: 'Build and Test' - - - template: templates/java-api-artifacts-package-and-publish-steps-posix.yml - parameters: - arch: 'linux-x64' - buildConfig: 'Release' - artifactName: 'onnxruntime-java-linux-x64-cuda' - version: '$(OnnxRuntimeVersion)' - libraryName: 'libonnxruntime.so' - nativeLibraryName: 'libonnxruntime4j_jni.so' - - - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml - parameters: - buildConfig: 'Release' - artifactName: 'onnxruntime-linux-x64-cuda-$(OnnxRuntimeVersion)' - 
artifactNameNoVersionString: 'onnxruntime-linux-x64-cuda' - libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - template: templates/clean-agent-build-directory-step.yml - -- template: templates/linux-gpu-tensorrt-packaging-pipeline.yml +- template: stages/nuget-linux-cuda-packaging-stage.yml parameters: - artifactName: 'onnxruntime-linux-x64-tensorrt-$(OnnxRuntimeVersion)' - artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' - buildJava: true - buildJavaOption: '--build_java' - buildNodejs: true - buildNodejsOption: '--build_nodejs' + CudaVersion: ${{ parameters.CudaVersion }} + docker_base_image: ${{ variables.docker_base_image }} + linux_trt_version: ${{ variables.linux_trt_version }} #CUDA without tensorrt - template: templates/win-ci.yml @@ -527,109 +484,6 @@ stages: displayName: 'Clean Agent Directories' condition: always() -- stage: Linux_Packaging_combined_GPU - dependsOn: - - Linux_C_API_Packaging_GPU_x64 - - Linux_C_API_Packaging_GPU_TensorRT_x64 - condition: succeeded() - jobs: - - job: - workspace: - clean: all - pool: 'Onnxruntime-Linux-GPU' - - steps: - - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime - submodules: false - - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples - submodules: false - - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux - submodules: false - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - - script: | - set -e -x - cd $(Build.SourcesDirectory) - mv manylinux onnxruntime - ls - - - template: templates/with-container-registry-steps.yml - parameters: - Steps: - - script: | - tools/ci_build/get_docker_image.py \ - --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda \ - --context tools/ci_build/github/linux/docker \ - --docker-build-args "--network=host --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 --build-arg BUILD_UID=$( id -u )" \ - --container-registry onnxruntimebuildcache \ - --multiple_repos \ - --repository onnxruntimecuda118xtrt86build - displayName: "Get onnxruntimecuda118xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda" - workingDirectory: $(Build.SourcesDirectory)/onnxruntime - ContainerRegistry: onnxruntimebuildcache - - - template: templates/set-version-number-variables-step.yml - parameters: - versionFileDirectory: '$(Build.SourcesDirectory)/onnxruntime' - workingDirectory: '$(Build.SourcesDirectory)/onnxruntime' - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact - Combined GPU' - inputs: - artifactName: 'onnxruntime-linux-x64-cuda' - targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' - - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact - Combined GPU' - inputs: - artifactName: 'onnxruntime-linux-x64-tensorrt' - targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' - - - task: ShellScript@2 - displayName: 'Shell Script' - inputs: - scriptPath: 'onnxruntime/tools/ci_build/github/linux/extract_and_bundle_gpu_package.sh' - args: '-a $(Build.BinariesDirectory)/tgz-artifacts' - workingDirectory: 
'$(Build.BinariesDirectory)/tgz-artifacts' - - - task: ArchiveFiles@2 - inputs: - rootFolderOrFile: '$(Build.BinariesDirectory)/tgz-artifacts/onnxruntime-linux-x64-gpu' - includeRootFolder: false - archiveType: 'tar' # Options: zip, 7z, tar, wim - tarCompression: 'gz' - archiveFile: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' - replaceExistingArchive: true - - - template: templates/validate-package.yml - parameters: - PackageType: 'tarball' - PackagePath: '$(Build.ArtifactStagingDirectory)' - PackageName: 'onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' - ScriptPath: '$(Build.SourcesDirectory)/onnxruntime/tools/nuget/validate_package.py' - PlatformsSupported: 'linux-x64' - VerifyNugetSigning: false - workingDirectory: '$(Build.ArtifactStagingDirectory)' - - - - task: CmdLine@2 - displayName: 'Test C API application for GPU package' - inputs: - script: | - docker run --gpus all -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/models:/data/models --volume $(Build.SourcesDirectory):/src_dir \ - --volume $(Build.ArtifactStagingDirectory):/artifact_src -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \ - /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet/run_capi_application.sh -o /src_dir/onnxruntime -p /artifact_src/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz -w /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet - workingDirectory: '$(Build.ArtifactStagingDirectory)' - - - task: PublishPipelineArtifact@1 - inputs: - targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' - artifactName: 'onnxruntime-linux-x64-gpu' - - template: templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' - - stage: Windows_Packaging_combined_GPU dependsOn: diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 48a6e0e8529e6..dbbc9ef27e513 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -40,7 +40,16 @@ stages: - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh workingDirectory: $(Build.SourcesDirectory) displayName: 'Build and Test' - +# We only support Maven package for CUDA 11.8 + - ${{ if eq(parameters.CudaVersion, '11.8') }}: + - template: ../templates/java-api-artifacts-package-and-publish-steps-posix.yml + parameters: + arch: 'linux-x64' + buildConfig: 'Release' + artifactName: 'onnxruntime-java-linux-x64-cuda' + version: '$(OnnxRuntimeVersion)' + libraryName: 'libonnxruntime.so' + nativeLibraryName: 'libonnxruntime4j_jni.so' - template: ../templates/c-api-artifacts-package-and-publish-steps-posix.yml parameters: buildConfig: 'Release' @@ -82,6 +91,10 @@ stages: - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux submodules: false + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + - script: | set -e -x cd 
$(Build.SourcesDirectory) @@ -159,3 +172,6 @@ stages: inputs: targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' artifactName: 'onnxruntime-linux-x64-gpu' + - template: ../templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' \ No newline at end of file From e2e488d6f8bcd14f40e9e2c8e65f310ce9c0e872 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 16 Jan 2024 09:18:35 -0800 Subject: [PATCH 16/39] Revert "iOS packaging pipeline stability" (#19135) Reverts microsoft/onnxruntime#19097 because it broke the Android CI pipeline. --- .../external/onnxruntime_external_deps.cmake | 74 +++++++++---------- .../mac-ios-packaging-pipeline.yml | 2 +- .../stages/mac-ios-packaging-build-stage.yml | 7 +- 3 files changed, 42 insertions(+), 41 deletions(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index c79bb87fd7f5d..78f63227c8392 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -108,14 +108,41 @@ FetchContent_Declare( ) # Download a protoc binary from Internet if needed -if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) +if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) # This part of code is only for users' convenience. The code couldn't handle all cases. Users always can manually # download protoc from Protobuf's Github release page and pass the local path to the ONNX_CUSTOM_PROTOC_EXECUTABLE # variable. - if (APPLE) - # Using CMAKE_CROSSCOMPILING is not recommended for Apple target devices. - # https://cmake.org/cmake/help/v3.26/variable/CMAKE_CROSSCOMPILING.html - # To keep it simple, just download and use the universal protoc binary for Apple builds.
+ message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}") + if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32}) + FetchContent_Populate(protoc_binary) + endif() + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() + elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") + if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86}) + FetchContent_Populate(protoc_binary) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_aarch64}) + FetchContent_Populate(protoc_binary) + endif() + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() + elseif ((CMAKE_SYSTEM_NAME STREQUAL "Emscripten" OR CMAKE_SYSTEM_NAME STREQUAL "Android" OR CMAKE_SYSTEM_NAME STREQUAL "iOS") AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin") FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_mac_universal} URL_HASH SHA1=${DEP_SHA1_protoc_mac_universal}) FetchContent_Populate(protoc_binary) if(protoc_binary_SOURCE_DIR) @@ -123,38 +150,6 @@ if(NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) endif() - elseif(CMAKE_CROSSCOMPILING) - message("CMAKE_HOST_SYSTEM_NAME: ${CMAKE_HOST_SYSTEM_NAME}") - if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") - if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win64} URL_HASH SHA1=${DEP_SHA1_protoc_win64}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_win32} URL_HASH SHA1=${DEP_SHA1_protoc_win32}) - FetchContent_Populate(protoc_binary) - endif() - if(protoc_binary_SOURCE_DIR) - message("Use prebuilt protoc") - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc.exe) - set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) - endif() - elseif(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") - if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x64} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x64}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_x86} URL_HASH SHA1=${DEP_SHA1_protoc_linux_x86}) - FetchContent_Populate(protoc_binary) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") - FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_linux_aarch64} URL_HASH 
SHA1=${DEP_SHA1_protoc_linux_aarch64}) - FetchContent_Populate(protoc_binary) - endif() - if(protoc_binary_SOURCE_DIR) - message("Use prebuilt protoc") - set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) - set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) - endif() - endif() endif() endif() @@ -189,9 +184,9 @@ FetchContent_Declare( ) set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests" FORCE) -#TODO: we'd better to turn the following option off. However, it will cause +#TODO: we'd better to turn the following option off. However, it will cause # ".\build.bat --config Debug --parallel --skip_submodule_sync --update" fail with an error message: -# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is +# install(EXPORT "ONNXTargets" ...) includes target "onnx_proto" which requires target "libprotobuf-lite" that is # not in any export set. #set(protobuf_INSTALL OFF CACHE BOOL "Install protobuf binaries and files" FORCE) set(protobuf_USE_EXTERNAL_GTEST ON CACHE BOOL "" FORCE) @@ -567,3 +562,4 @@ endif() FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR) FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR) + diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 34a51649fc384..5fd15b64e03b6 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -53,7 +53,7 @@ stages: displayName: "Set common variables" pool: - vmImage: "macOS-12" # macOS-13 seems less stable. macOS-12 will work for this job. + vmImage: "macOS-13" timeoutInMinutes: 5 diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index ed32c5d0e15be..d1dff0769e25f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -78,6 +78,10 @@ stages: pip install -r tools/ci_build/github/apple/ios_packaging.requirements.txt displayName: "Install Python requirements" + - script: | + $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh -p $(Build.BinariesDirectory)/protobuf_install -d $(Build.SourcesDirectory)/cmake/deps.txt + displayName: "Build Host Protoc" + # create and test mobile pods - script: | python tools/ci_build/github/apple/build_and_assemble_apple_pods.py \ @@ -87,7 +91,8 @@ stages: --test \ --variant ${{ parameters.packageVariant }} \ --build-settings-file "${{ variables.buildSettingsFile }}" \ - ${{ variables.optionalIncludeOpsByConfigOption }} + ${{ variables.optionalIncludeOpsByConfigOption }} \ + -b="--path_to_protoc_exe=$(Build.BinariesDirectory)/protobuf_install/bin/protoc" displayName: "Build macOS/iOS framework and assemble pod package files" - script: | From 80f274ca6f2f4572d827edd6dc7f736d7a8c036a Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Tue, 16 Jan 2024 09:42:59 -0800 Subject: [PATCH 17/39] Fix SkipLayerNormalization shape inference (#18724) SkipLayerNorm has more than one input, so `propagateShapeAndTypeFromFirstInput` is not enough. 
--- .../core/graph/contrib_ops/bert_defs.cc | 4 +- .../contrib_ops/shape_inference_functions.cc | 39 +++++++++++++++++++ .../contrib_ops/shape_inference_functions.h | 3 +- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index df8d0a59cb033..0317ffcfb0e31 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -1285,7 +1285,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Output(3, "input_skip_bias_sum", "Sum of the input and skip inputs (and bias if it exists) with shape (batch_size, sequence_length, hidden_size).", "T", OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.") .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.") - .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + .TypeAndShapeInferenceFunction(SkipLayerNormalizationShapeInference)); ONNX_MS_OPERATOR_SET_SCHEMA( SkipSimplifiedLayerNormalization, 1, @@ -1334,7 +1334,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.") .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.") - .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + .TypeAndShapeInferenceFunction(SkipLayerNormalizationShapeInference)); constexpr const char* NGramRepeatBlock_ver1_doc = R"DOC( Enforce no repetition of n-grams. Scores are set to `-inf` for tokens that form a repeated n-gram if added to the back of the input_ids. 
diff --git a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc index eeef20e9dff5e..8b1812f62be25 100644 --- a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc +++ b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.cc @@ -114,6 +114,45 @@ void EmbedLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& c } } +void SkipLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx) { + propagateShapeAndTypeFromFirstInput(ctx); + + auto stash_type = ONNX_NAMESPACE::TensorProto_DataType_FLOAT; + if (ctx.getNumOutputs() > 1) { + auto output_type = ctx.getOutputType(1); + output_type->mutable_tensor_type()->set_elem_type(static_cast(stash_type)); + } + if (ctx.getNumOutputs() > 2) { + auto output_type = ctx.getOutputType(2); + output_type->mutable_tensor_type()->set_elem_type(static_cast(stash_type)); + } + if (ctx.getNumOutputs() > 3) { + propagateElemTypeFromInputToOutput(ctx, 0, 3); + } + if (!hasNInputShapes(ctx, 1)) { + return; + } + auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); + int64_t input_ndim = input_shape.dim_size(); + int axis = static_cast(input_ndim - 1); + + if (ctx.getNumOutputs() > 1) { + auto mean_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape(); + mean_shape->CopyFrom(input_shape); + mean_shape->mutable_dim(axis)->set_dim_value(1); + } + + if (ctx.getNumOutputs() > 2) { + auto inv_std_dev_shape = ctx.getOutputType(2)->mutable_tensor_type()->mutable_shape(); + inv_std_dev_shape->CopyFrom(input_shape); + inv_std_dev_shape->mutable_dim(axis)->set_dim_value(1); + } + + if (ctx.getNumOutputs() > 3) { + propagateShapeFromInputToOutput(ctx, 0, 3); + } +} + // Shape inference for Attention and QAttention void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index) { // Input 0, 1, 2 are input, weights and bias. diff --git a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h index 93cf5b304f653..6eb06af15309c 100644 --- a/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h +++ b/onnxruntime/core/graph/contrib_ops/shape_inference_functions.h @@ -13,5 +13,6 @@ namespace onnxruntime { namespace contrib { void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index); void EmbedLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx); +void SkipLayerNormalizationShapeInference(::ONNX_NAMESPACE::InferenceContext& ctx); } // namespace contrib -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime From 8e272b9cac70a11c472fb002af755213a4dabf66 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 16 Jan 2024 16:53:15 -0500 Subject: [PATCH 18/39] Update build.py to remove unused functions and update python to 3.8 (#19164) ### Description ### Motivation and Context --- tools/ci_build/build.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 0da4adb51767d..1a6262edf45c9 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -56,7 +56,7 @@ def __init__(self, message): def _check_python_version(): - required_minor_version = 7 + required_minor_version = 8 if (sys.version_info.major, sys.version_info.minor) < (3, required_minor_version): raise UsageError( f"Invalid Python version. 
At least Python 3.{required_minor_version} is required. " @@ -786,11 +786,6 @@ def get_linux_distro(): return "", "" -def is_ubuntu_1604(): - dist, ver = get_linux_distro() - return dist == "Ubuntu" and ver.startswith("16.04") - - def get_config_build_dir(build_dir, config): # build directory per configuration return os.path.join(build_dir, config) @@ -844,15 +839,6 @@ def update_submodules(source_dir): run_subprocess(["git", "submodule", "update", "--init", "--recursive"], cwd=source_dir) -def is_docker(): - path = "/proc/self/cgroup" - return ( - os.path.exists("/.dockerenv") - or os.path.isfile(path) - and any("docker" in line for line in open(path)) # noqa: SIM115 - ) - - def install_python_deps(numpy_version=""): dep_packages = ["setuptools", "wheel", "pytest"] dep_packages.append(f"numpy=={numpy_version}" if numpy_version else "numpy>=1.16.6") @@ -2401,16 +2387,6 @@ def run_csharp_tests(source_dir, build_dir, use_cuda, use_openvino, use_tensorrt run_subprocess(cmd_args, cwd=csharp_source_dir) -def is_cross_compiling_on_apple(args): - if not is_macOS(): - return False - if args.ios: - return True - if args.osx_arch != platform.machine(): - return True - return False - - def generate_documentation(source_dir, build_dir, configs, validate): # Randomly choose one build config config = next(iter(configs)) @@ -2725,12 +2701,6 @@ def main(): log.info("Activating emsdk...") run_subprocess([emsdk_file, "activate", emsdk_version], cwd=emsdk_dir) - if is_ubuntu_1604(): - if args.arm or args.arm64: - raise BuildError("Only Windows ARM(64) cross-compiled builds supported currently through this script") - if not is_docker() and not args.use_acl and not args.use_armnn: - install_python_deps() - if args.enable_pybind and is_windows(): install_python_deps(args.numpy_version) From c935c8fbd2e463a3e0153145140a8efd780dfabc Mon Sep 17 00:00:00 2001 From: moyo1997 <54333118+moyo1997@users.noreply.github.com> Date: Tue, 16 Jan 2024 16:24:37 -0800 Subject: [PATCH 19/39] remove unnecessary environment variable (#19166) remove unnecessary environment variable when building as arm64x --- build_arm64x.bat | 1 - 1 file changed, 1 deletion(-) diff --git a/build_arm64x.bat b/build_arm64x.bat index fbcdd373086a9..1ed268ae94a43 100644 --- a/build_arm64x.bat +++ b/build_arm64x.bat @@ -5,7 +5,6 @@ setlocal set PATH=C:\Program Files\Git\usr\bin;%PATH% -set LINK_REPRO_NAME=/mylink.rsp rem Requires a Python install to be available in your PATH python "%~dp0\tools\ci_build\build.py" --arm64 --buildasx --build_dir "%~dp0\build\arm64-x" %* From e61861b0a121bca1d60e5d4a3722e52b6820c430 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Tue, 16 Jan 2024 16:36:28 -0800 Subject: [PATCH 20/39] Clean up generated files in QNN UTs (#19127) ### Description Clean up generated files in QNN UTs --- onnxruntime/test/providers/qnn/simple_op_htp_test.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 8ff65c08e8633..c4244fe532456 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -815,7 +815,8 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) { // Check the Onnx skeleton file is generated EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); // Check the Qnn context cache binary file is generated - 
EXPECT_TRUE(std::filesystem::exists("qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin")); + std::string qnn_ctx_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; + EXPECT_TRUE(std::filesystem::exists(qnn_ctx_bin)); // 2nd run loads and run from QDQ model + Onnx skeleton file + Qnn context cache binary file TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, {}), @@ -837,6 +838,10 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCacheNonEmbedModeTest) { QDQTolerance(), logging::Severity::kERROR, context_binary_file); + + // Clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + ASSERT_EQ(std::remove(qnn_ctx_bin.c_str()), 0); } // Run QDQ model on HTP 2 times @@ -898,6 +903,9 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCache_InvalidGraph) { ASSERT_STATUS_OK(session_object.Load(qnn_ctx_model_data.data(), static_cast(qnn_ctx_model_data.size()))); // Verify the return status with code INVALID_GRAPH ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); + + // Clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } // Run QDQ model on HTP with 2 inputs @@ -955,6 +963,8 @@ TEST_F(QnnHTPBackendTests, ContextBinary2InputsTest) { QDQTolerance(), logging::Severity::kERROR, context_binary_file); + // Clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } TEST_F(QnnHTPBackendTests, QuantAccuracyTest) { From 81d363045ba273b16a3ec654c53a15217a2d2a36 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 16 Jan 2024 17:25:18 -0800 Subject: [PATCH 21/39] Upgrade Ubuntu machine pool from 20.04 to 22.04 (#19117) ### Description Upgrade Ubuntu machine pool from 20.04 to 22.04 --- .../build-perf-test-binaries-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 2 +- ...lean-build-docker-image-cache-pipeline.yml | 10 +-------- .../cuda-packaging-pipeline.yml | 2 +- .../azure-pipelines/linux-ci-pipeline.yml | 4 ++-- .../linux-cpu-aten-pipeline.yml | 2 +- .../linux-cpu-eager-pipeline.yml | 2 +- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- .../linux-migraphx-ci-pipeline.yml | 2 +- .../npm-packaging-pipeline.yml | 4 ++-- .../nuget/templates/test_linux.yml | 2 +- .../orttraining-linux-ci-pipeline.yml | 2 +- .../orttraining-pai-ci-pipeline.yml | 4 ++-- .../orttraining-py-packaging-pipeline-cpu.yml | 2 +- .../azure-pipelines/post-merge-jobs.yml | 6 ++--- .../py-package-test-pipeline.yml | 2 +- .../stages/py-cuda-packaging-stage.yml | 2 +- .../stages/py-cuda-publishing-stage.yml | 2 +- .../templates/android-java-api-aar.yml | 2 +- .../templates/build-linux-wasm-step.yml | 22 +++++++++---------- .../azure-pipelines/templates/c-api-cpu.yml | 4 ++-- .../templates/c-api-linux-cpu.yml | 2 +- .../azure-pipelines/templates/linux-ci.yml | 2 +- .../linux-cpu-packaging-pipeline.yml | 2 +- .../templates/linux-wasm-ci.yml | 2 +- ...device-training-cpu-packaging-pipeline.yml | 2 +- .../py-packaging-selectable-stage.yml | 2 +- .../templates/py-packaging-stage.yml | 4 ++-- .../github/azure-pipelines/templates/rocm.yml | 2 +- .../azure-pipelines/web-ci-pipeline.yml | 2 +- .../linux/build_linux_python_package.sh | 6 ++--- .../ci_build/github/linux/run_python_tests.sh | 2 +- 32 files changed, 50 insertions(+), 60 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml index 3ddc167bc0a61..d37e9bdc5da4c 100644 --- 
a/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/build-perf-test-binaries-pipeline.yml @@ -28,7 +28,7 @@ stages: artifactName: 'onnxruntime-android-full-aar' job_name_suffix: 'Full' publish_executables: '1' - pool_name: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool_name: 'onnxruntime-Ubuntu2204-AMD-CPU' # build Python packages # Linux GPU only diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 2169a3ce1bb9e..3803333bd880a 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -246,7 +246,7 @@ stages: workspace: clean: all timeoutInMinutes: 120 - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU variables: RocmVersion: '5.6' steps: diff --git a/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml b/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml index 24086b6166fe4..43e668eef8d00 100644 --- a/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/clean-build-docker-image-cache-pipeline.yml @@ -19,8 +19,7 @@ variables: jobs: - job: Clean_Build_Docker_Image_Cache - pool: - vmImage: 'ubuntu-20.04' + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 30 @@ -29,13 +28,6 @@ jobs: submodules: false fetchDepth: 1 - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.9' - addToPath: true - architecture: 'x64' - displayName: "Use Python 3.9" - - task: AzureCLI@2 inputs: azureSubscription: 'AIInfraBuild' diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index df7b5f59d28fc..1d2ba88652f48 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -126,7 +126,7 @@ stages: BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeNodejsBindingArch: 'x64' - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' PackageJava: false PackageNodeJS: false # Nuget Packaging diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 07f672c75d029..cff7c96aa9253 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -46,7 +46,7 @@ stages: skipComponentGovernanceDetection: true ORT_CACHE_DIR: $(Agent.TempDirectory)/ort_ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' @@ -123,7 +123,7 @@ stages: skipComponentGovernanceDetection: true ORT_CACHE_DIR: $(Agent.TempDirectory)/ort_ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml index 146186e9eeaf5..090ce97296687 100644 --- a/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml @@ -43,7 +43,7 @@ jobs: variables: CCACHE_DIR: $(Agent.TempDirectory)/ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml index a5c08e95b7efc..d3d13cc5344da 100644 --- a/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-cpu-eager-pipeline.yml @@ -51,7 +51,7 @@ jobs: timeoutInMinutes: 120 workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - checkout: self clean: true diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 0993a81a02249..5bc8c3603ee92 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -64,7 +64,7 @@ jobs: CCACHE_DIR: $(Pipeline.Workspace)/ccache workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f7571a3b7eab6..9cf7a3fb42397 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -46,7 +46,7 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 120 steps: diff --git a/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml index 7f73da23b5eb1..21fc205c72e89 100644 --- a/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/npm-packaging-pipeline.yml @@ -41,7 +41,7 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} IsReleasePipeline: true - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' PackageName: 'onnxruntime-web' ExtraBuildArgs: '' UseWebPoolName: true @@ -54,7 +54,7 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} BuildConfig: 'Release' - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' PackageName: 'onnxruntime-react-native' BuildAndroidAARStageDependsOn: 'Precheck_and_extract_commit' diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index f44106c145228..2567bec9fdfc2 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -1,5 +1,5 @@ 
parameters: - AgentPool: 'onnxruntime-Ubuntu2004-AMD-CPU' + AgentPool: 'onnxruntime-Ubuntu2204-AMD-CPU' ArtifactSuffix: '' NugetPackageName : '' StageSuffix: 'CPU' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml index 018672e0b2dea..26fd5e1ec0b5d 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml @@ -44,7 +44,7 @@ jobs: skipComponentGovernanceDetection: true CCACHE_DIR: $(Pipeline.Workspace)/ccache TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] - pool: onnxruntime-Ubuntu-2004-Training-CPU + pool: onnxruntime-Ubuntu-2204-Training-CPU steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index a53f91fb317cb..71b224b65964f 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -37,7 +37,7 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 120 steps: @@ -132,7 +132,7 @@ jobs: TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 120 steps: diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 817ace0571837..a44a8c215939f 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -16,7 +16,7 @@ stages: timeoutInMinutes: 180 workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU strategy: matrix: diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 5ee39876733e2..3ec5400dacc65 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -4,7 +4,7 @@ stages: parameters: NpmPackagingMode: 'dev' IsReleasePipeline: true - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' BuildStaticLib: true ExtraBuildArgs: '' UseWebPoolName: true @@ -367,7 +367,7 @@ stages: timeoutInMinutes: 150 variables: skipComponentGovernanceDetection: true - pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' steps: - template: templates/set-version-number-variables-step.yml @@ -413,7 +413,7 @@ stages: - job: AndroidCustomBuildScript workspace: clean: all - pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' variables: dockerImageTag: onnxruntime-android-custom-build steps: diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 55d3150f21aa3..04f555deb1a22 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: - template: templates/py-packaging-linux-test-cpu.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' base_image: 'registry.access.redhat.com/ubi8/ubi' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index e6d8ee35e75e3..f82c80d4d7e93 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -105,7 +105,7 @@ stages: - template: ../templates/py-linux-gpu.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} docker_base_image: ${{ variables.docker_base_image }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml index 4f440e0f61b3d..2a4debcf9fba5 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-publishing-stage.yml @@ -20,7 +20,7 @@ stages: dependsOn: [] jobs: - job: - pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + pool: 'onnxruntime-Ubuntu2204-AMD-CPU' steps: - checkout: none - task: DownloadPipelineArtifact@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml index 5e61f88b4aa18..509fea45ebe53 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml @@ -33,7 +33,7 @@ parameters: - name: pool_name displayName: Pool name type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' + default: 'onnxruntime-Ubuntu2204-AMD-CPU' - name: packageName # now we can build onnxruntime or onnxruntime-mobile for Android, need specify it here diff --git a/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml b/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml index e664cf69dec76..e77b1a4008b7c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/build-linux-wasm-step.yml @@ -24,19 +24,17 @@ parameters: type: string steps: - - task: Cache@2 - inputs: - ${{if eq(variables['Build.SourceBranchName'], 'merge')}}: - key: ' "${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} | merge ' - ${{else}}: - key: '"${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} | $(Build.SourceVersion) ' - path: ${{parameters.CacheDir}} - restoreKeys: | - "${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} - displayName: Cache Task - condition: eq('${{parameters.WithCache}}', true) - - ${{if eq(parameters.WithCache, true)}}: + - task: Cache@2 + inputs: + ${{if eq(variables['Build.SourceBranchName'], 'merge')}}: + key: ' "${{parameters.TODAY}}" | 
${{parameters.AdditionalKey}} | merge ' + ${{else}}: + key: '"${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} | $(Build.SourceVersion) ' + path: ${{parameters.CacheDir}} + restoreKeys: | + "${{parameters.TODAY}}" | ${{parameters.AdditionalKey}} + displayName: Cache Task - script: | set -e -x pushd '$(Build.SourcesDirectory)/cmake/external/emsdk' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 81319e07c6b17..168602a17910b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -759,7 +759,7 @@ stages: - template: ../nuget/templates/test_linux.yml parameters: - AgentPool : onnxruntime-Ubuntu2004-AMD-CPU + AgentPool : onnxruntime-Ubuntu2204-AMD-CPU NugetPackageName : 'Microsoft.ML.OnnxRuntime' ArtifactSuffix: 'CPU' SpecificArtifact: ${{ parameters.SpecificArtifact }} @@ -796,7 +796,7 @@ stages: OS: Linux BuildId: ${{ parameters.BuildId }} SpecificArtifact: ${{ parameters.SpecificArtifact }} - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' - template: final-jar-testing.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index 8538f15e93753..cf470b3fa2448 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -19,7 +19,7 @@ parameters: - name: PoolName type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' + default: 'onnxruntime-Ubuntu2204-AMD-CPU' - name: ArtifactNamePrefix type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml index 7b9788d90b17d..15165e3cb0950 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-ci.yml @@ -1,5 +1,5 @@ parameters: - AgentPool : 'onnxruntime-Ubuntu2004-AMD-CPU' + AgentPool : 'onnxruntime-Ubuntu2204-AMD-CPU' StageName : 'Linux_CI_Dev' RunDockerBuildArgs: '-o ubuntu20.04 -d cpu -x "--build_wheel"' NuPackScript: '' diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml index 6ad5f9f38a4db..8972d55f6e190 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml @@ -32,7 +32,7 @@ stages: BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeNodejsBindingArch: 'x64' - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' ArtifactNamePrefix: ${{ parameters.ArtifactNamePrefix }} PackageJava: ${{ parameters.PackageJava }} PackageNodeJS: ${{ parameters.PackageNodeJS }} diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml index e6693a6f6d26a..d279e667f9091 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml @@ -13,7 +13,7 @@ parameters: - name: PoolName type: string - default: 'onnxruntime-Ubuntu2004-AMD-CPU' + default: 'onnxruntime-Ubuntu2204-AMD-CPU' - name: 
SkipPublish type: boolean diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index 51583a25f63ac..cf39be23cbdaf 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -336,7 +336,7 @@ stages: - template: ../nuget/templates/test_linux.yml parameters: - AgentPool : onnxruntime-Ubuntu2004-AMD-CPU + AgentPool : onnxruntime-Ubuntu2204-AMD-CPU NugetPackageName : 'Microsoft.ML.OnnxRuntime.Training' ArtifactSuffix: 'Training-CPU' StageSuffix: 'Training_CPU' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 00ba5ea4a475a..01cab936aa529 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -48,7 +48,7 @@ stages: timeoutInMinutes: 90 workspace: clean: all - pool: onnxruntime-Ubuntu2004-AMD-CPU + pool: onnxruntime-Ubuntu2204-AMD-CPU strategy: matrix: ${{ each PythonVersion in parameters.python_version }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index abe06e80f4f19..8669a883c31f1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -430,7 +430,7 @@ stages: - template: py-linux.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' base_image: 'registry.access.redhat.com/ubi8/ubi' devtoolset_rootpath: /opt/rh/gcc-toolset-12/root ld_library_path_arg: /opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64 @@ -443,6 +443,6 @@ stages: - template: py-linux-gpu.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2004-AMD-CPU' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} diff --git a/tools/ci_build/github/azure-pipelines/templates/rocm.yml b/tools/ci_build/github/azure-pipelines/templates/rocm.yml index 2e9e6c6b35a2e..43a80aa4fd4e3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/rocm.yml +++ b/tools/ci_build/github/azure-pipelines/templates/rocm.yml @@ -14,7 +14,7 @@ jobs: workspace: clean: all timeoutInMinutes: 180 - pool: Ubuntu-2004-rocm-aiinfra + pool: Ubuntu-2204-rocm-aiinfra variables: - name: PythonVersion value: ${{ parameters.PythonVersion }} diff --git a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml index e352a04068ee8..24809ccfdec1f 100644 --- a/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/web-ci-pipeline.yml @@ -53,7 +53,7 @@ stages: parameters: NpmPackagingMode: ${{ variables.NpmPackagingMode }} IsReleasePipeline: false - PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' BuildStaticLib: true ExtraBuildArgs: 
$(ExtraBuildArgs) WASMTemplate: linux-wasm-ci.yml diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 1059dd5047477..933d1f3d5874a 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -7,9 +7,9 @@ mkdir -p /build/dist EXTRA_ARG="" -# Put 3.8 at the last because Ubuntu 20.04 use python 3.8 and we will upload the intermediate build files of this -# config to Azure DevOps Artifacts and download them to a Ubuntu 20.04 machine to run the tests. -PYTHON_EXES=("/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp310-cp310/bin/python3.10" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12" "/opt/python/cp38-cp38/bin/python3.8") +# Put 3.10 last because Ubuntu 22.04 uses python 3.10 and we will upload the intermediate build files of this +# config to Azure DevOps Artifacts and download them to a Ubuntu 22.04 machine to run the tests. +PYTHON_EXES=("/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9" "/opt/python/cp311-cp311/bin/python3.11" "/opt/python/cp312-cp312/bin/python3.12" "/opt/python/cp310-cp310/bin/python3.10") while getopts "d:p:x:c:" parameter_Option do case "${parameter_Option}" in diff --git a/tools/ci_build/github/linux/run_python_tests.sh b/tools/ci_build/github/linux/run_python_tests.sh index 3164a10a09dfd..082c561dd17b9 100755 --- a/tools/ci_build/github/linux/run_python_tests.sh +++ b/tools/ci_build/github/linux/run_python_tests.sh @@ -15,7 +15,7 @@ c) BUILD_CONFIG=${OPTARG};; esac done -export PATH=/opt/python/cp38-cp38/bin:$PATH +export PATH=/opt/python/cp310-cp310/bin:$PATH cd /build files=(whl/*.whl) FILE_NAME="${files[0]}" From 07d3aed3aa3a054deb502cedf867f559fc690755 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 17 Jan 2024 13:35:13 +0800 Subject: [PATCH 22/39] [WebNN EP] Fixed build issue with disable_rtti (#19173) Previously, building the WebNN EP with --disable_rtti would throw unboundTypeError, since unbound type names are illegal with RTTI disabled in the Embind API. We can fix it by adding a -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 flag. --- cmake/adjust_global_compile_flags.cmake | 5 +++++ cmake/onnxruntime_webassembly.cmake | 5 ++++- tools/ci_build/build.py | 4 ---- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index 30d8cbf78fb1a..2c7bf9f1c2f5c 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -123,6 +123,11 @@ if (onnxruntime_DISABLE_RTTI) add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/GR->" "$<$<COMPILE_LANGUAGE:CXX>:/we4541>") else() add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-fno-rtti>") + if (onnxruntime_USE_WEBNN) + # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled + # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001 + add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0>") + endif() endif() else() #MSVC RTTI flag /GR is not added to CMAKE_CXX_FLAGS by default. But, anyway VC++2019 treats "/GR" default on.
diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 858583e64e9df..546d50c1ca2d3 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -268,7 +268,10 @@ else() endif() if (onnxruntime_USE_WEBNN) - set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT") + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " --bind -sWASM_BIGINT") + if (onnxruntime_DISABLE_RTTI) + set_property(TARGET onnxruntime_webassembly APPEND_STRING PROPERTY LINK_FLAGS " -fno-rtti -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0") + endif() endif() # Set link flag to enable exceptions support, this will override default disabling exception throwing behavior when disable exceptions. diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 1a6262edf45c9..1034a82cb2854 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1283,10 +1283,6 @@ def generate_build_tree( if args.use_webnn: if not args.build_wasm: raise BuildError("WebNN is only available for WebAssembly build.") - if args.disable_rtti: - # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled - # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/16911 - raise BuildError("WebNN is not supported with RTTI disabled.") cmake_args += ["-Donnxruntime_USE_WEBNN=ON"] if args.use_snpe: From 9876cc7c4f5f6249e1dec8b93abf7b8dfcf5ca0c Mon Sep 17 00:00:00 2001 From: wejoncy Date: Wed, 17 Jan 2024 15:46:19 +0800 Subject: [PATCH 23/39] more inputs support for LLM exporter (#19005) ### Description ### Motivation and Context --- .../transformers/large_model_exporter.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py index 1601b1a203b9a..9e8b284bf56c7 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -224,24 +224,35 @@ def fetch_onnx_inputs_outputs_name( if not num_of_past_key: num_of_past_key = model.config.num_hidden_layers - onnx_inp_names = ("input_ids", "attention_mask") + # filter out constant inputs + onnx_inp_names = tuple( + [torch_input_names[i] for i in range(len(torch_input_names)) if isinstance(onnx_inputs[i], torch.Tensor)] + ) + assert ( + "input_ids" in onnx_inp_names and "attention_mask" in onnx_inp_names + ), "input_ids and attention_mask must exist in inputs" onnx_out_names = ("logits",) onnx_dynamic_axes = { "input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}, } + # add dynamic dimensions for the unknown inputs + for idx, name in enumerate(onnx_inp_names): + if name not in onnx_dynamic_axes: + unknown_dims = {i: f"{idx}__unknown_dims__{i}" for i in range(onnx_inputs[idx].dim())} + onnx_dynamic_axes[name] = unknown_dims if input_with_past: for i in range(num_of_past_key): - onnx_inp_names += (f"present_key.{i}",) - onnx_inp_names += (f"present_values.{i}",) + onnx_inp_names += (f"past_key_values.{i}.key",) + onnx_inp_names += (f"past_key_values.{i}.value",) onnx_dynamic_axes[onnx_inp_names[-1]] = kv_cache_axis onnx_dynamic_axes[onnx_inp_names[-2]] = kv_cache_axis if with_past or input_with_past: for i in range(num_of_past_key): - onnx_out_names += (f"past_key.{i}",) - onnx_out_names += (f"past_values.{i}",) +
onnx_out_names += (f"present.{i}.key",) + onnx_out_names += (f"present.{i}.value",) onnx_dynamic_axes[onnx_out_names[-1]] = kv_cache_axis onnx_dynamic_axes[onnx_out_names[-2]] = kv_cache_axis From 63dd605d3310f5a9540c414216f3f3b67d455c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Wed, 17 Jan 2024 19:00:36 +0100 Subject: [PATCH 24/39] Fix untyped float values in quantization tool missing from PR #18043 (#19182) ### Description Extends the code coverage to the Entropy, Histogram and Distribution calibration methods, and fixes bugs found while doing it. ### Motivation and Context Bugs detected in [Olive](https://github.com/microsoft/OLive). --- .../python/tools/quantization/calibrate.py | 86 +++++++++++++++---- .../python/tools/quantization/quant_utils.py | 2 +- .../python/quantization/test_op_matmul.py | 66 +++++++++++++- 3 files changed, 131 insertions(+), 23 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index d0db57c392961..77b3dce9fb004 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -5,6 +5,7 @@ # license information. # -------------------------------------------------------------------------- import abc +import copy import itertools import os import uuid @@ -21,6 +22,48 @@ from .quant_utils import apply_plot, load_model_with_shape_infer, smooth_distribution +def rel_entr(pk: np.ndarray, qk: np.ndarray) -> np.ndarray: + """ + See https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr. + Python implementation. + """ + res = np.empty(pk.shape, dtype=pk.dtype) + res[:] = pk[:] * np.log(pk[:] / qk[:]) + c2 = (pk == 0) & (qk >= 0) + res[c2] = 0 + c1 = (pk > 0) & (qk > 0) + res[~c1] = np.inf + return res + + +def entropy( + pk: np.ndarray, + qk: np.ndarray, + base: Optional[float] = None, + axis: int = 0, +) -> np.ndarray: + """ + Simplified version of entropy. + Source: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html. + This avoids taking a dependency on scipy just for this function. + """ + assert base is None or base > 0, f"base={base} must be a positive number or `None`."
+ assert qk is not None, "qk is None" + + pk = np.asarray(pk).astype(np.float32) + pk = 1.0 * pk / np.sum(pk, axis=axis, keepdims=True) + + qk = np.asarray(qk).astype(np.float32) + pk, qk = np.broadcast_arrays(pk, qk) + qk = 1.0 * qk / np.sum(qk, axis=axis, keepdims=True) + vec = rel_entr(pk, qk) + + s = np.sum(vec, axis=axis) + if base is not None: + s /= np.log(base) + return s.astype(pk.dtype) + + class TensorData: _allowed = frozenset(["avg", "std", "lowest", "highest", "hist", "hist_edges", "bins"]) _floats = frozenset(["avg", "std", "lowest", "highest", "hist_edges"]) @@ -708,8 +751,8 @@ def collect_absolute_value(self, name_to_arr): min_value = np.min(data_arr_np) max_value = np.max(data_arr_np) else: - min_value = 0 - max_value = 0 + min_value = np.array(0, dtype=data_arr_np.dtype) + max_value = np.array(0, dtype=data_arr_np.dtype) data_arr_np = np.absolute(data_arr_np) # only consider absolute value @@ -725,6 +768,8 @@ def collect_absolute_value(self, name_to_arr): old_histogram = self.histogram_dict[tensor] old_min = old_histogram[2] old_max = old_histogram[3] + assert hasattr(old_min, "dtype"), f"old_min should be a numpy array but is {type(old_min)}" + assert hasattr(old_max, "dtype"), f"old_min should be a numpy array but is {type(old_max)}" old_hist = old_histogram[0] old_hist_edges = old_histogram[1] temp_amax = np.max(data_arr_np) @@ -757,7 +802,7 @@ def collect_value(self, name_to_arr): min_value = np.array(0, dtype=data_arr.dtype) max_value = np.array(0, dtype=data_arr.dtype) - threshold = max(abs(min_value), abs(max_value)) + threshold = np.array(max(abs(min_value), abs(max_value)), dtype=data_arr.dtype) if tensor in self.histogram_dict: old_histogram = self.histogram_dict[tensor] @@ -809,7 +854,7 @@ def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_thresho def compute_collection_result(self): if not self.histogram_dict or len(self.histogram_dict) == 0: raise ValueError("Histogram has not been collected. Please run collect() first.") - print(f"Finding optimal threshold for each tensor using {self.method} algorithm ...") + print(f"Finding optimal threshold for each tensor using {self.method!r} algorithm ...") if self.method == "entropy": return self.compute_entropy() @@ -938,7 +983,14 @@ def compute_distribution(self): assert avg_coef.dtype != np.float64 assert std_coef.dtype != np.float64 assert hist_edges.dtype != np.float64 - thresholds_dict[tensor] = TensorData(avg=avg_coef, std=std_coef, hist=hist, hist_edges=hist_edges) + thresholds_dict[tensor] = TensorData( + avg=avg_coef, + std=std_coef, + hist=hist, + hist_edges=hist_edges, + lowest=hist_edges.min(), + highest=hist_edges.max(), + ) # Plot histogram for debug only if os.environ.get("QUANTIZATION_DEBUG", 0) in (1, "1"): @@ -952,18 +1004,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): `q` is a truncated version of the original distribution. 
Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf """ - import copy - - from scipy.stats import entropy - hist = histogram[0] hist_edges = histogram[1] num_bins = hist.size zero_bin_index = num_bins // 2 num_half_quantized_bin = num_quantized_bins // 2 + dtype = histogram[1].dtype kl_divergence = np.zeros(zero_bin_index - num_half_quantized_bin + 1) - thresholds = [(0, 0) for i in range(kl_divergence.size)] + thresholds = [(np.array(0, dtype=dtype), np.array(0, dtype=dtype)) for i in range(kl_divergence.size)] # <------------ num bins ----------------> # <--- quantized bins ----> @@ -983,10 +1032,7 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): start_index = zero_bin_index - i end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins - thresholds[i - num_half_quantized_bin] = ( - float(hist_edges[start_index]), - float(hist_edges[end_index]), - ) + thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index]) sliced_distribution = copy.deepcopy(hist[start_index:end_index]) @@ -1020,15 +1066,15 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): norm = sum(nonzeros[start:end]) if norm != 0: - q[start:end] = float(quantized_bins[index]) / float(norm) + q[start:end] = quantized_bins[index] / norm p = smooth_distribution(p) q = smooth_distribution(q) - - if isinstance(q, np.ndarray): - kl_divergence[i - num_half_quantized_bin] = entropy(p, q) + if p is None or q is None: + div = np.array(np.inf, dtype=dtype) else: - kl_divergence[i - num_half_quantized_bin] = float("inf") + div = np.array(entropy(p, q), dtype=dtype) + kl_divergence[i - num_half_quantized_bin] = div min_kl_divergence_idx = np.argmin(kl_divergence) optimal_threshold = thresholds[min_kl_divergence_idx] @@ -1038,6 +1084,8 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): optimal_threshold = (min_value, optimal_threshold[1]) if optimal_threshold[1] > max_value: optimal_threshold = (optimal_threshold[0], max_value) + assert hasattr(optimal_threshold[0], "dtype") + assert hasattr(optimal_threshold[1], "dtype") return optimal_threshold diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 68c2b3bf79c8b..036f49b420734 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -653,7 +653,7 @@ def smooth_distribution(p, eps=0.0001): if not n_nonzeros: # raise ValueError('The discrete probability distribution is malformed. 
All entries are 0.') - return -1 + return None eps1 = eps * float(n_zeros) / float(n_nonzeros) assert eps1 < 1.0, "n_zeros=%d, n_nonzeros=%d, eps1=%f" % ( n_zeros, diff --git a/onnxruntime/test/python/quantization/test_op_matmul.py b/onnxruntime/test/python/quantization/test_op_matmul.py index 344583aa7c624..91368bd643158 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul.py +++ b/onnxruntime/test/python/quantization/test_op_matmul.py @@ -10,13 +10,39 @@ import numpy as np import onnx import packaging.version as pv +from numpy.testing import assert_almost_equal from onnx import TensorProto, helper from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type +from onnxruntime.capi.onnxruntime_pybind11_state import Fail from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantType, quantize_dynamic, quantize_static +from onnxruntime.quantization.calibrate import entropy + + +def skip_if_new_opset_exception_raised(func): + def wrapper(*args, **kwargs): + try: + func(*args, **kwargs) + except Fail as e: + if "is under development and support for this is limited" in str(e): + raise unittest.SkipTest(f"Skipped {func} due to opset under development.") # noqa: B904 + raise + + return wrapper class TestOpMatMul(unittest.TestCase): + def test_entropy(self): + try: + from scipy.stats import entropy as scipy_entropy + except ImportError: + raise unittest.SkipTest("scipy not installed.") # noqa: B904 + pk = (np.arange(10) - 5).astype(np.float32) / 10 + qk = -(np.arange(10) - 5).astype(np.float32) / 10 + ent = scipy_entropy(pk, qk) + get = entropy(pk, qk) + assert_almost_equal(ent, get) + def input_feeds(self, n, name2shape, dtype): input_data_list = [] for _i in range(n): @@ -324,10 +350,11 @@ def test_quantize_matmul_u8u8(self): @unittest.skipIf( pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" ) + @skip_if_new_opset_exception_raised def test_quantize_matmul_u8u8_f16(self): - self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 19, 9) + self.quantize_matmul_u8u8(onnx.TensorProto.FLOAT16, 21, 9) - def quantize_matmul_s8s8(self, tt, opset, ir_version): + def quantize_matmul_s8s8(self, tt, opset, ir_version, calibrate_method=CalibrationMethod.MinMax): np.random.seed(1) model_fp_path = "matmul_fp.onnx" self.construct_model_matmul(model_fp_path, tensor_type=tt, opset=opset, ir_version=ir_version) @@ -341,6 +368,7 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version): activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, extra_options={"ActivationSymmetric": True}, + calibrate_method=calibrate_method, ) self.static_quant_test_qdq( model_fp_path, @@ -348,6 +376,7 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version): activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, extra_options={"ActivationSymmetric": True}, + calibrate_method=calibrate_method, ) # dynamic quantization doesn't support activation:int8 @@ -357,11 +386,42 @@ def quantize_matmul_s8s8(self, tt, opset, ir_version): def test_quantize_matmul_s8s8(self): self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8) + def test_quantize_matmul_s8s8_entropy(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Entropy) + + def test_quantize_matmul_s8s8_percentile(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Percentile) + + def test_quantize_matmul_s8s8_distribution(self): + 
self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT, 18, 8, calibrate_method=CalibrationMethod.Distribution) + @unittest.skipIf( pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" ) + @skip_if_new_opset_exception_raised def test_quantize_matmul_s8s8_f16(self): - self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 19, 9) + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9) + + @unittest.skipIf( + pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" + ) + @skip_if_new_opset_exception_raised + def test_quantize_matmul_s8s8_f16_entropy(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Entropy) + + @unittest.skipIf( + pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" + ) + @skip_if_new_opset_exception_raised + def test_quantize_matmul_s8s8_f16_percentile(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Percentile) + + @unittest.skipIf( + pv.Version(onnx.__version__) < pv.Version("1.15.1"), reason="Shape inference bug, see onnx PR #5709" + ) + @skip_if_new_opset_exception_raised + def test_quantize_matmul_s8s8_f16_distribution(self): + self.quantize_matmul_s8s8(onnx.TensorProto.FLOAT16, 21, 9, calibrate_method=CalibrationMethod.Distribution) def quantize_matmul_e4m3fn_same(self, tt, opset, ir_version): np.random.seed(1) From bd9d8fb2a545a59d87a4c23308ec543ba6e4c41d Mon Sep 17 00:00:00 2001 From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Date: Wed, 17 Jan 2024 11:18:32 -0800 Subject: [PATCH 25/39] [ORT 1.17.0 release] Bump up version to 1.18.0 (#19170) ### Description Bump up version to 1.18.0 since the release branch has been cut. 
### Motivation and Context Co-authored-by: rachguo --- VERSION_NUMBER | 2 +- .../Training/NativeTrainingMethods.shared.cs | 4 ++-- docs/python/README.rst | 5 +++++ include/onnxruntime/core/session/onnxruntime_c_api.h | 2 +- js/common/lib/version.ts | 2 +- js/common/package-lock.json | 4 ++-- js/common/package.json | 2 +- js/node/lib/version.ts | 2 +- js/node/package-lock.json | 6 +++--- js/node/package.json | 2 +- js/react_native/lib/version.ts | 2 +- js/react_native/package.json | 2 +- js/react_native/yarn.lock | 2 +- js/web/lib/version.ts | 2 +- js/web/package-lock.json | 6 +++--- js/web/package.json | 2 +- onnxruntime/__init__.py | 2 +- onnxruntime/core/session/onnxruntime_c_api.cc | 8 ++++---- 18 files changed, 31 insertions(+), 26 deletions(-) diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 092afa15df4df..84cc529467b05 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.17.0 +1.18.0 diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs index 68a399f8b9671..7fe16f4156ef2 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Training/NativeTrainingMethods.shared.cs @@ -65,10 +65,10 @@ static NativeTrainingMethods() DOrtGetApi OrtGetApi = (DOrtGetApi)Marshal.GetDelegateForFunctionPointer(NativeMethods.OrtGetApiBase().GetApi, typeof(DOrtGetApi)); // TODO: Make this save the pointer, and not copy the whole structure across - api_ = (OrtApi)OrtGetApi(17 /*ORT_API_VERSION*/); + api_ = (OrtApi)OrtGetApi(18 /*ORT_API_VERSION*/); OrtGetTrainingApi = (DOrtGetTrainingApi)Marshal.GetDelegateForFunctionPointer(api_.GetTrainingApi, typeof(DOrtGetTrainingApi)); - trainingApiPtr = OrtGetTrainingApi(17 /*ORT_API_VERSION*/); + trainingApiPtr = OrtGetTrainingApi(18 /*ORT_API_VERSION*/); if (trainingApiPtr != IntPtr.Zero) { trainingApi_ = (OrtTrainingApi)Marshal.PtrToStructure(trainingApiPtr, typeof(OrtTrainingApi)); diff --git a/docs/python/README.rst b/docs/python/README.rst index 32bb3729e01d0..bbc8571fe3f17 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime `_ or the `Github project `_. """ -__version__ = "1.17.0" +__version__ = "1.18.0" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index d77c188f832a7..91a7f0d930b51 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2397,7 +2397,7 @@ Second example, if we wanted to add and remove some members, we'd do this: In GetApi we now make it return ort_api_3 for version 3. */ -static constexpr OrtApi ort_api_1_to_17 = { +static constexpr OrtApi ort_api_1_to_18 = { // NOTE: The ordering of these fields MUST not change after that version has shipped since existing binaries depend on this ordering. 
// Shipped as version 1 - DO NOT MODIFY (see above text for more information) @@ -2756,16 +2756,16 @@ static_assert(offsetof(OrtApi, KernelContext_GetResource) / sizeof(void*) == 265 static_assert(offsetof(OrtApi, SetUserLoggingFunction) / sizeof(void*) == 266, "Size of version 17 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.17.0", +static_assert(std::string_view(ORT_VERSION) == "1.18.0", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it -// 2. If there were any APIs added to ort_api_1_to_17 above: +// 2. If there were any APIs added to ort_api_1_to_18 above: // a. Add the 'End of version #' markers (pattern above should be obvious) // b. Add a static_assert in the directly above list of version sizes to ensure nobody adds any more functions to the just shipped API version ORT_API(const OrtApi*, OrtApis::GetApi, uint32_t version) { if (version >= 1 && version <= ORT_API_VERSION) - return &ort_api_1_to_17; + return &ort_api_1_to_18; fprintf(stderr, "The requested API version [%u] is not available, only API versions [1, %u] are supported in this build." From bc219ed553fc8d4b8fa3c7b4476810a63a864d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Wed, 17 Jan 2024 20:33:34 +0100 Subject: [PATCH 26/39] [TensorRT EP] Enable a minimal CUDA EP compilation without kernels (#19052) Adresses https://github.com/microsoft/onnxruntime/issues/18542. I followed the advice given by @RyanUnderhill [here](https://github.com/microsoft/onnxruntime/pull/18731#issuecomment-1848261925) and went with a minimal CUDA EP for now. --- cmake/CMakeLists.txt | 1 + cmake/onnxruntime_providers_cuda.cmake | 49 ++++++++++++++----- .../core/providers/cuda/cuda_context.h | 3 +- onnxruntime/core/providers/cuda/cuda_call.cc | 4 ++ .../core/providers/cuda/cuda_common.cc | 42 ++++++++-------- onnxruntime/core/providers/cuda/cuda_common.h | 6 ++- .../providers/cuda/cuda_execution_provider.cc | 14 +++++- onnxruntime/core/providers/cuda/cuda_pch.h | 7 +++ .../core/providers/cuda/cuda_stream_handle.cc | 4 ++ .../core/providers/cuda/cudnn_common.cc | 3 +- .../core/providers/cuda/cudnn_common.h | 3 +- 11 files changed, 97 insertions(+), 39 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bc96218dac79e..712d5d76108aa 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -79,6 +79,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) +option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." 
OFF) option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_COREML "Build with CoreML support" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 84d1376f99d5e..9887d615c92d7 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -1,10 +1,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" - ) + + if (onnxruntime_CUDA_MINIMAL) + file(GLOB onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/tunable/*.cc" + ) + # Remove pch files + list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs + "${ONNXRUNTIME_ROOT}/core/providers/cuda/integer_gemm.cc" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/triton_kernel.h" + ) + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cc_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.h" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc" + ) + endif() # Remove pch files list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h" @@ -16,11 +31,16 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" - "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" - ) + + if (onnxruntime_CUDA_MINIMAL) + set(onnxruntime_providers_cuda_shared_srcs "") + else() + file(GLOB_RECURSE onnxruntime_providers_cuda_cu_srcs CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu" + "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cuh" + ) + endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs}) @@ -156,10 +176,15 @@ endif() add_dependencies(${target} onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) - if(onnxruntime_CUDNN_HOME) - target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + if(onnxruntime_CUDA_MINIMAL) + target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) + target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + else() + target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + if(onnxruntime_CUDNN_HOME) + target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) + target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) + endif() endif() if (onnxruntime_USE_TRITON_KERNEL) diff --git a/include/onnxruntime/core/providers/cuda/cuda_context.h 
b/include/onnxruntime/core/providers/cuda/cuda_context.h index 9416fad5f1448..1370f5c4c5e10 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_context.h +++ b/include/onnxruntime/core/providers/cuda/cuda_context.h @@ -16,9 +16,10 @@ #include "core/providers/custom_op_context.h" #include #include +#ifndef USE_CUDA_MINIMAL #include #include - +#endif namespace Ort { namespace Custom { diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index 4f223041e04e3..f60684795a4bc 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -30,6 +30,7 @@ const char* CudaErrString(cudaError_t x) { return cudaGetErrorString(x); } +#ifndef USE_CUDA_MINIMAL template <> const char* CudaErrString(cublasStatus_t e) { cudaDeviceSynchronize(); @@ -76,6 +77,7 @@ const char* CudaErrString(cufftResult e) { return "Unknown cufft error status"; } } +#endif #ifdef ORT_USE_NCCL template <> @@ -132,6 +134,7 @@ std::conditional_t CudaCall( template Status CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); template void CudaCall(cudaError retCode, const char* exprString, const char* libName, cudaError successCode, const char* msg, const char* file, const int line); +#ifndef USE_CUDA_MINIMAL template Status CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg, const char* file, const int line); template void CudaCall(cublasStatus_t retCode, const char* exprString, const char* libName, cublasStatus_t successCode, const char* msg, const char* file, const int line); template Status CudaCall(cudnnStatus_t retCode, const char* exprString, const char* libName, cudnnStatus_t successCode, const char* msg, const char* file, const int line); @@ -140,6 +143,7 @@ template Status CudaCall(curandStatus_t retCode, const ch template void CudaCall(curandStatus_t retCode, const char* exprString, const char* libName, curandStatus_t successCode, const char* msg, const char* file, const int line); template Status CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg, const char* file, const int line); template void CudaCall(cufftResult retCode, const char* exprString, const char* libName, cufftResult successCode, const char* msg, const char* file, const int line); +#endif #ifdef ORT_USE_NCCL template Status CudaCall(ncclResult_t retCode, const char* exprString, const char* libName, ncclResult_t successCode, const char* msg, const char* file, const int line); diff --git a/onnxruntime/core/providers/cuda/cuda_common.cc b/onnxruntime/core/providers/cuda/cuda_common.cc index 33f2938940e4d..65083f89f7f77 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.cc +++ b/onnxruntime/core/providers/cuda/cuda_common.cc @@ -14,6 +14,27 @@ namespace cuda { // 0x04 - pedantic constexpr const char* kCudaGemmOptions = "ORT_CUDA_GEMM_OPTIONS"; +const char* CudaDataTypeToString(cudaDataType_t dt) { + switch (dt) { + case CUDA_R_16F: + return "CUDA_R_16F"; + case CUDA_R_16BF: + return "CUDA_R_16BF"; + case CUDA_R_32F: + return "CUDA_R_32F"; +#if !defined(DISABLE_FLOAT8_TYPES) + // Note: CUDA_R_8F_E4M3 is defined with CUDA>=11.8 + case CUDA_R_8F_E4M3: + return "CUDA_R_8F_E4M3"; + case CUDA_R_8F_E5M2: + return "CUDA_R_8F_E5M2"; +#endif + default: + return ""; + } +} + +#ifndef USE_CUDA_MINIMAL // Initialize the singleton 
instance HalfGemmOptions HalfGemmOptions::instance; @@ -54,26 +75,6 @@ const char* cublasGetErrorEnum(cublasStatus_t error) { } } -const char* CudaDataTypeToString(cudaDataType_t dt) { - switch (dt) { - case CUDA_R_16F: - return "CUDA_R_16F"; - case CUDA_R_16BF: - return "CUDA_R_16BF"; - case CUDA_R_32F: - return "CUDA_R_32F"; -#if !defined(DISABLE_FLOAT8_TYPES) - // Note: CUDA_R_8F_E4M3 is defined with CUDA>=11.8 - case CUDA_R_8F_E4M3: - return "CUDA_R_8F_E4M3"; - case CUDA_R_8F_E5M2: - return "CUDA_R_8F_E5M2"; -#endif - default: - return ""; - } -} - const char* CublasComputeTypeToString(cublasComputeType_t ct) { switch (ct) { case CUBLAS_COMPUTE_16F: @@ -92,6 +93,7 @@ const char* CublasComputeTypeToString(cublasComputeType_t ct) { return ""; } } +#endif // It must exist somewhere already. cudaDataType_t ToCudaDataType(int32_t element_type) { diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 707099bac3ce0..e9941ce743bc3 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -22,13 +22,14 @@ namespace onnxruntime { namespace cuda { #define CUDA_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDA_CALL(expr)) +#ifndef USE_CUDA_MINIMAL #define CUBLAS_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUBLAS_CALL(expr)) #define CUSPARSE_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUSPARSE_CALL(expr)) #define CURAND_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CURAND_CALL(expr)) #define CUDNN_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUDNN_CALL(expr)) #define CUDNN2_RETURN_IF_ERROR(expr, m) ORT_RETURN_IF_ERROR(CUDNN_CALL2(expr, m)) #define CUFFT_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(CUFFT_CALL(expr)) - +#endif // Type mapping for MLFloat16 to half template class ToCudaType { @@ -93,7 +94,7 @@ inline bool CalculateFdmStrides(gsl::span p, const std::vector KernelCreateInfo BuildKernelCreateInfo() { @@ -1326,6 +1332,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing BuildKernelCreateInfo, BuildKernelCreateInfo, +#ifndef USE_CUDA_MINIMAL BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2201,6 +2208,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, +#endif }; for (auto& function_table_entry : function_table) { @@ -2210,6 +2218,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { } } +#ifndef USE_CUDA_MINIMAL #ifndef DISABLE_CONTRIB_OPS ORT_RETURN_IF_ERROR(::onnxruntime::contrib::cuda::RegisterCudaContribKernels(kernel_registry)); #endif @@ -2220,6 +2229,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { #ifdef ENABLE_TRAINING_OPS ORT_RETURN_IF_ERROR(::onnxruntime::cuda::RegisterCudaTrainingKernels(kernel_registry)); +#endif #endif return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/cuda_pch.h b/onnxruntime/core/providers/cuda/cuda_pch.h index f48554e8f1286..dfe50fe0a8832 100644 --- a/onnxruntime/core/providers/cuda/cuda_pch.h +++ b/onnxruntime/core/providers/cuda/cuda_pch.h @@ -10,12 +10,19 @@ #include #include +#include +#ifndef USE_CUDA_MINIMAL #include #include #include #include #include #include +#else +typedef void* cudnnHandle_t; +typedef void* cublasHandle_t; +typedef void* cublasLtHandle_t; +#endif #ifdef ORT_USE_NCCL #include diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc 
b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 7c866395ecf6e..0a256394b7d99 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -69,6 +69,7 @@ CudaStream::CudaStream(cudaStream_t stream, release_cpu_buffer_on_cuda_stream_(release_cpu_buffer_on_cuda_stream), deferred_cpu_allocator_(*this), ep_info_(ep_info) { +#ifndef USE_CUDA_MINIMAL if (own_flag) { CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); @@ -80,10 +81,12 @@ CudaStream::CudaStream(cudaStream_t stream, cudnn_handle_ = external_cudnn_handle; CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream)); } +#endif } CudaStream::~CudaStream() { ORT_IGNORE_RETURN_VALUE(CleanUpOnRunEnd()); +#ifndef USE_CUDA_MINIMAL if (own_stream_) { cublasDestroy(cublas_handle_); cudnnDestroy(cudnn_handle_); @@ -91,6 +94,7 @@ CudaStream::~CudaStream() { if (handle) cudaStreamDestroy(static_cast(handle)); } +#endif } std::unique_ptr CudaStream::CreateNotification(size_t /*num_consumers*/) { diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index 4df59a98b12e5..c850f7b583bfc 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -9,7 +9,7 @@ #include "core/common/gsl.h" #include "shared_inc/cuda_call.h" #include "core/providers/cpu/tensor/utils.h" - +#ifndef USE_CUDA_MINIMAL namespace onnxruntime { namespace cuda { @@ -222,3 +222,4 @@ const Float8E5M2 Consts::One = Float8E5M2(1.0f, true); } // namespace cuda } // namespace onnxruntime +#endif diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index 8a94a334ee688..fdd14dedad47e 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -7,7 +7,7 @@ #include #include "core/providers/cuda/cuda_common.h" - +#ifndef USE_CUDA_MINIMAL namespace onnxruntime { namespace cuda { @@ -260,3 +260,4 @@ SetPoolingNdDescriptorHelper(cudnnPoolingDescriptor_t poolingDesc, } // namespace cuda } // namespace onnxruntime +#endif From 146ebaf91e85185a0ac18c82bc69eba685ab9727 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:03:43 -0800 Subject: [PATCH 27/39] [js/web] allow proxy to load model with 1GB <= size < 2GB (#19178) ### Description allow proxy to load model with 1GB <= size < 2GB resolves #19157. 
--- js/web/lib/wasm/wasm-utils-load-file.ts | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/wasm-utils-load-file.ts b/js/web/lib/wasm/wasm-utils-load-file.ts index abe480a43c790..c6cdba2320bde 100644 --- a/js/web/lib/wasm/wasm-utils-load-file.ts +++ b/js/web/lib/wasm/wasm-utils-load-file.ts @@ -47,9 +47,19 @@ export const loadFile = async(file: string|Blob|ArrayBufferLike|Uint8Array): Pro } const reader = response.body.getReader(); - // use WebAssembly Memory to allocate larger ArrayBuffer - const pages = Math.ceil(fileSize / 65536); - const buffer = new WebAssembly.Memory({initial: pages, maximum: pages}).buffer; + let buffer; + try { + // try to create ArrayBuffer directly + buffer = new ArrayBuffer(fileSize); + } catch (e) { + if (e instanceof RangeError) { + // use WebAssembly Memory to allocate larger ArrayBuffer + const pages = Math.ceil(fileSize / 65536); + buffer = new WebAssembly.Memory({initial: pages, maximum: pages}).buffer; + } else { + throw e; + } + } let offset = 0; // eslint-disable-next-line no-constant-condition From f87e69801f200a34ddb312f1d39e7296f19b660b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 17 Jan 2024 15:04:22 -0800 Subject: [PATCH 28/39] [js/web] show warning when numThreads is set but threads is not supported (#19179) ### Description show warning when numThreads is set but threads is not supported. Resolves #19148, #18933 for web: when crossOriginIsolated is false. for node: always disable. --- js/web/lib/backend-wasm.ts | 6 ++++++ js/web/lib/wasm/wasm-factory.ts | 33 +++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/js/web/lib/backend-wasm.ts b/js/web/lib/backend-wasm.ts index d9f63fec9c492..31ecffb07e40c 100644 --- a/js/web/lib/backend-wasm.ts +++ b/js/web/lib/backend-wasm.ts @@ -31,6 +31,12 @@ export const initializeFlags = (): void => { } if (typeof env.wasm.numThreads !== 'number' || !Number.isInteger(env.wasm.numThreads) || env.wasm.numThreads <= 0) { + // Web: when crossOriginIsolated is false, SharedArrayBuffer is not available so WebAssembly threads will not work. + // Node.js: onnxruntime-web does not support multi-threads in Node.js. + if ((typeof self !== 'undefined' && !self.crossOriginIsolated) || + (typeof process !== 'undefined' && process.versions && process.versions.node)) { + env.wasm.numThreads = 1; + } const numCpuLogicalCores = typeof navigator === 'undefined' ? cpus().length : navigator.hardwareConcurrency; env.wasm.numThreads = Math.min(4, Math.ceil((numCpuLogicalCores || 1) / 2)); } diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index 81508a253ce8b..9b9334c93b78c 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -28,13 +28,34 @@ let initialized = false; let initializing = false; let aborted = false; -const isMultiThreadSupported = (): boolean => { - try { - // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work. - if (typeof SharedArrayBuffer === 'undefined') { - return false; +const isMultiThreadSupported = (numThreads: number): boolean => { + // WebAssembly threads are set to 1 (single thread). + if (numThreads === 1) { + return false; + } + + // If 'SharedArrayBuffer' is not available, WebAssembly threads will not work. 
+ if (typeof SharedArrayBuffer === 'undefined') { + if (typeof self !== 'undefined' && !self.crossOriginIsolated) { + // eslint-disable-next-line no-console + console.warn( + 'env.wasm.numThreads is set to ' + numThreads + + ', but this will not work unless you enable crossOriginIsolated mode. ' + + 'See https://web.dev/cross-origin-isolation-guide/ for more info.'); } + return false; + } + + // onnxruntime-web does not support multi-threads in Node.js. + if (typeof process !== 'undefined' && process.versions && process.versions.node) { + // eslint-disable-next-line no-console + console.warn( + 'env.wasm.numThreads is set to ' + numThreads + + ', however, currently onnxruntime-web does not support multi-threads in Node.js. ' + + 'Please consider using onnxruntime-node for performance critical scenarios.'); + } + try { // Test for transferability of SABs (for browsers. needed for Firefox) // https://groups.google.com/forum/#!msg/mozilla.dev.platform/IHkBZlHETpA/dwsMNchWEQAJ if (typeof MessageChannel !== 'undefined') { @@ -106,7 +127,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise const numThreads = flags.numThreads!; const simd = flags.simd!; - const useThreads = numThreads > 1 && isMultiThreadSupported(); + const useThreads = isMultiThreadSupported(numThreads); const useSimd = simd && isSimdSupported(); const wasmPaths = flags.wasmPaths; From 9da3e36138dd24377fbb0b4022d891b3baf07b84 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Wed, 17 Jan 2024 20:20:42 -0500 Subject: [PATCH 29/39] Fix buildJava from Zip-Nuget-Java-Nodejs Packaging Pipeline (#19187) ### Description ### Motivation and Context --- .../c-api-noopenmp-packaging-pipelines.yml | 2 ++ .../stages/nuget-linux-cuda-packaging-stage.yml | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 3803333bd880a..aa1a75bfcda45 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -204,6 +204,8 @@ stages: CudaVersion: ${{ parameters.CudaVersion }} docker_base_image: ${{ variables.docker_base_image }} linux_trt_version: ${{ variables.linux_trt_version }} + buildJava: true + buildNodejs: true #CUDA without tensorrt - template: templates/win-ci.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index dbbc9ef27e513..db9bcacbf0754 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -6,6 +6,12 @@ parameters: type: string - name: linux_trt_version type: string +- name: buildJava + type: boolean + default: false +- name: buildNodejs + type: boolean + default: false stages: # Linux CUDA without TensorRT Packaging @@ -66,9 +72,9 @@ stages: parameters: artifactName: 'onnxruntime-linux-x64-tensorrt-$(OnnxRuntimeVersion)' artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' - buildJava: false + buildJava: ${{ parameters.buildJava }} buildJavaOption: '--build_java' - buildNodejs: false + buildNodejs: ${{ parameters.buildNodejs }} buildNodejsOption: '--build_nodejs' CudaVersion: ${{ parameters.CudaVersion }} # Linux CUDA Combined Testing and 
Publishing From dadd3ea704243a8c2b2ded790ae01f3b57c4da53 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 18 Jan 2024 11:11:14 -0800 Subject: [PATCH 30/39] Check the ep_cache_context and don't allow access outside the directory (#19174) ### Description Check the ep_cache_context node property for EPContext node, and don't allow relative path like "../file_path" --- .../qnn/builder/onnx_ctx_model_helper.cc | 28 +++- .../test/providers/qnn/simple_op_htp_test.cc | 129 ++++++++++++++++++ 2 files changed, 155 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index b157396306d01..fd9bf200c45ef 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -88,9 +88,33 @@ Status GetEpContextFromGraph(const onnxruntime::GraphViewer& graph_viewer, qnn_model); } - std::string external_qnn_context_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, ""); std::filesystem::path folder_path = std::filesystem::path(ctx_onnx_model_path).parent_path(); - std::filesystem::path context_binary_path = folder_path.append(external_qnn_context_binary_file_name); + std::string external_qnn_ctx_binary_file_name = node_helper.Get(EP_CACHE_CONTEXT, ""); + ORT_RETURN_IF(external_qnn_ctx_binary_file_name.empty(), "The file path in ep_cache_context should not be empty."); +#ifdef _WIN32 + onnxruntime::PathString external_qnn_context_binary_path = onnxruntime::ToPathString(external_qnn_ctx_binary_file_name); + auto ctx_file_path = std::filesystem::path(external_qnn_context_binary_path.c_str()); + ORT_RETURN_IF(ctx_file_path.is_absolute(), "External mode should set ep_cache_context field with a relative path, but it is an absolute path: ", + external_qnn_ctx_binary_file_name); + auto relative_path = ctx_file_path.lexically_normal().make_preferred().wstring(); + if (relative_path.find(L"..", 0) != std::string::npos) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "The file path in ep_cache_context field has '..'. It's not allowed to point outside the directory."); + } + + std::filesystem::path context_binary_path = folder_path.append(relative_path); +#else + ORT_RETURN_IF(external_qnn_ctx_binary_file_name[0] == '/', + "External mode should set ep_cache_context field with a relative path, but it is an absolute path: ", + external_qnn_ctx_binary_file_name); + if (external_qnn_ctx_binary_file_name.find("..", 0) != std::string::npos) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "The file path in ep_cache_context field has '..'. 
It's not allowed to point outside the directory."); + } + std::filesystem::path context_binary_path = folder_path.append(external_qnn_ctx_binary_file_name); + std::string file_full_path = context_binary_path.string(); +#endif + if (!std::filesystem::is_regular_file(context_binary_path)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "The file path in ep_cache_context does not exist or is not accessible."); + } size_t buffer_size{0}; std::ifstream cache_file(context_binary_path.string().c_str(), std::ifstream::binary); diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index c4244fe532456..4ac1f5ddca643 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -908,6 +908,135 @@ TEST_F(QnnHTPBackendTests, ContextBinaryCache_InvalidGraph) { ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } +std::string CreateQnnCtxModelWithNonEmbedMode(std::string external_bin_path) { + const std::unordered_map domain_to_version = {{"", 11}, {kMSDomain, 1}}; + auto& logging_manager = DefaultLoggingManager(); + onnxruntime::Model model("QNN_ctx_model", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + logging_manager.DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder helper(graph); + std::vector shape = {2, 3}; + NodeArg* graph_input = MakeTestInput(helper, TestInputDef(shape, true, {0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f})); + auto* graph_output = helper.MakeOutput(shape); + Node& ep_context_node = helper.AddNode("EPContext", {graph_input}, {graph_output}, kMSDomain); + ep_context_node.AddAttribute("embed_mode", static_cast(0)); + // The .. in the path will cause INVALID_GRAPH + ep_context_node.AddAttribute("ep_cache_context", external_bin_path); + ep_context_node.AddAttribute("partition_name", "QNNExecutionProvider_QNN_1110111000111000111_1_0"); + ep_context_node.AddAttribute("source", "QNN"); + helper.SetGraphOutputs(); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + + return model_data; +} + +// Create a model with EPContext node. Set the node property ep_cache_context has ".." +// Verify that it return INVALID_GRAPH status +TEST_F(QnnHTPBackendTests, QnnContextBinaryRelativePathTest) { + std::string model_data = CreateQnnCtxModelWithNonEmbedMode("../qnn_context.bin"); + + SessionOptions so; + so.session_logid = "qnn_ctx_model_logger"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + // Verify the return status with code INVALID_GRAPH + ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); +} + +// Create a model with EPContext node. 
Set the node property ep_cache_context has absolute path +// Verify that it return INVALID_GRAPH status +TEST_F(QnnHTPBackendTests, QnnContextBinaryAbsolutePathTest) { +#if defined(_WIN32) + std::string external_ctx_bin_path = "D:/qnn_context.bin"; +#else + std::string external_ctx_bin_path = "/data/qnn_context.bin"; +#endif + std::string model_data = CreateQnnCtxModelWithNonEmbedMode(external_ctx_bin_path); + + SessionOptions so; + so.session_logid = "qnn_ctx_model_logger"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + // Verify the return status with code INVALID_GRAPH + ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); +} + +// Create a model with EPContext node. Set the node property ep_cache_context to a file not exist +// Verify that it return INVALID_GRAPH status +TEST_F(QnnHTPBackendTests, QnnContextBinaryFileNotExistTest) { + std::string model_data = CreateQnnCtxModelWithNonEmbedMode("qnn_context_not_exist.bin"); + + SessionOptions so; + so.session_logid = "qnn_ctx_model_logger"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + // Verify the return status with code INVALID_GRAPH + ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); +} + +// Create a model with EPContext node. 
Set the node property ep_cache_context to empty string +// Verify that it return INVALID_GRAPH status +TEST_F(QnnHTPBackendTests, QnnContextBinaryFileEmptyStringTest) { + std::string model_data = CreateQnnCtxModelWithNonEmbedMode(""); + + SessionOptions so; + so.session_logid = "qnn_ctx_model_logger"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + // Verify the return status with code INVALID_GRAPH + ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::INVALID_GRAPH); +} + // Run QDQ model on HTP with 2 inputs // 1st run will generate the Qnn context cache onnx file // 2nd run will load and run from QDQ model + Qnn context cache model From dd2177c5d70b8e5b704f7ee0ddce134243eacb24 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Thu, 18 Jan 2024 13:11:47 -0800 Subject: [PATCH 31/39] enable webnn in ci build (#19163) ### Description ### Motivation and Context --- .../github/azure-pipelines/templates/linux-wasm-ci.yml | 4 ++-- .../ci_build/github/azure-pipelines/templates/win-wasm-ci.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml index d279e667f9091..360e3d5ef879b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml @@ -174,7 +174,7 @@ jobs: ${{ else }}: AdditionalKey: wasm_simd_jsep | ${{ parameters.BuildConfig }} CacheDir: $(ORT_CACHE_DIR)/wasm_simd_jsep - Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/wasm_simd_jsep --enable_wasm_simd --use_jsep --target onnxruntime_webassembly --skip_tests' + Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/wasm_simd_jsep --enable_wasm_simd --use_jsep --use_webnn --target onnxruntime_webassembly --skip_tests' DisplayName: 'Build (simd + JSEP)' WithCache: ${{ parameters.WithCache }} - template: build-linux-wasm-step.yml @@ -185,7 +185,7 @@ jobs: ${{ else }}: AdditionalKey: wasm_simd_threads_jsep | ${{ parameters.BuildConfig }} CacheDir: $(ORT_CACHE_DIR)/wasm_simd_threads_jsep - Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/wasm_simd_threads_jsep --enable_wasm_simd --enable_wasm_threads --use_jsep --target onnxruntime_webassembly --skip_tests' + Arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)/wasm_simd_threads_jsep --enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --target onnxruntime_webassembly --skip_tests' DisplayName: 'Build (simd + threads + JSEP)' WithCache: ${{ parameters.WithCache }} diff --git a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml index 79647cc5699c8..f2005ec5ada39 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml @@ -127,14 +127,14 @@ jobs: displayName: 'Build (simd + JSEP)' inputs: scriptPath: 
'$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)\wasm_simd_jsep --enable_wasm_simd --use_jsep --target onnxruntime_webassembly --skip_tests' + arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)\wasm_simd_jsep --enable_wasm_simd --use_jsep --use_webnn --target onnxruntime_webassembly --skip_tests' workingDirectory: '$(Build.BinariesDirectory)' - ${{ if eq(parameters.BuildJsep, true) }}: - task: PythonScript@0 displayName: 'Build (simd + threads + JSEP)' inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)\wasm_simd_threads_jsep --enable_wasm_simd --enable_wasm_threads --use_jsep --target onnxruntime_webassembly --skip_tests' + arguments: '$(CommonBuildArgs) --build_dir $(Build.BinariesDirectory)\wasm_simd_threads_jsep --enable_wasm_simd --enable_wasm_threads --use_jsep --use_webnn --target onnxruntime_webassembly --skip_tests' workingDirectory: '$(Build.BinariesDirectory)' - ${{ if eq(parameters.SkipPublish, false) }}: - script: |

From 459c750b031339456e4061b1c4214904e6853ccd Mon Sep 17 00:00:00 2001
From: luoyu-intel
Date: Fri, 19 Jan 2024 05:16:34 +0800
Subject: [PATCH 32/39] Update x64 template kernel library for 'sqnbitgemm' (#19016)

### Description
1. Make the JBLAS code an external module of ORT.
2. Move the q4 GEMM code to contrib_ops.
3. Update the template kernel library to the v0.1 release.

### Motivation and Context
We found that the current LLM model performance is far below our expectations. Here is some performance data collected on the Mistral-7B model with a Xeon-8480:

8 threads | prompt length=32 past_len=32 | prompt length=1 past_len=32
-- | -- | --
ORT-main | 1220ms | 263ms
Neural-speed | 564ms | 87ms
ORT-this PR | 597ms | 120ms

Although `Neural-speed` and `ORT-this PR` use the same int4 kernel code, there is a 33ms (87ms vs. 120ms) latency gap between the two frameworks. Through some statistics analysis, the summed latency of `MatMulNBits` is 86.7ms, while the summed latency of all int4 GEMMs in `Neural-speed` is 84.8ms. So other OPs introduce the extra ~30ms of latency. The performance of MatMulNBits in this PR meets our expectations.

### Remaining Issues
1. For hybrid CPUs, like the Core 12900K, the ONNXRuntime thread pool uses TaskGranularityFactor to scale its number of threads. This is not expected by our code design and may slow down hybrid CPU performance by 30~40%.
2. Prepack uses a single thread, which makes session initialization very slow.
3. MatMulNBits with zero points will fall through to COMP_FP32 even when accuracy_level=4, because our COMP_INT8 IGemmCore path with zero-point processing is not optimized yet. It will be updated in the future. So, for an int4 model with zero points, there is no difference between accuracy_level 0 and 4.
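To illustrate how the accuracy_level caveat interacts with model preparation, below is a minimal sketch of producing a 4-bit `MatMulNBits` model with symmetric quantization (no zero points), so that `accuracy_level=4` can actually take the COMP_INT8 path. The `MatMul4BitsQuantizer` import path and keyword arguments are assumptions about the Python quantization tooling, not something specified by this change; check the `matmul_4bits_quantizer` module in your ONNX Runtime version for the exact interface.

```python
# Minimal sketch (assumed API): quantize MatMul weights to 4-bit MatMulNBits.
# is_symmetric=True avoids zero points, so accuracy_level=4 is not forced back
# to the COMP_FP32 path mentioned in the remaining issues above.
import onnx
from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer

model = onnx.load("model_fp32.onnx")  # hypothetical input model path

quantizer = MatMul4BitsQuantizer(
    model,
    block_size=32,       # quantization block size along the K dimension
    is_symmetric=True,   # symmetric quantization => no zero points
    accuracy_level=4,    # request the int8 compute type for the int4 GEMM
)
quantizer.process()
quantizer.model.save_model_to_file("model_int4.onnx", use_external_data_format=True)
```

With zero points (`is_symmetric=False`), the kernels in this change fall back to COMP_FP32 regardless of accuracy_level, so the setting only affects symmetric int4 models.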
--- cmake/CMakeLists.txt | 18 +- cmake/deps.txt | 2 +- cmake/external/neural_speed.cmake | 18 + cmake/onnxruntime_mlas.cmake | 13 - cmake/onnxruntime_providers_cpu.cmake | 15 + .../cpu/quantization/matmul_nbits.cc | 58 +- .../cpu/quantization/neural_speed_defs.h | 45 + .../cpu/quantization/neural_speed_gemm.cc | 438 ++ .../cpu/quantization/neural_speed_gemm.h | 129 + .../cpu/quantization/neural_speed_wrapper.h | 39 + onnxruntime/core/mlas/inc/mlas_qnbit.h | 130 - onnxruntime/core/mlas/lib/jblas_defs.h | 73 - onnxruntime/core/mlas/lib/jblas_gemm.cpp | 534 -- onnxruntime/core/mlas/lib/jblas_gemm.h | 61 - onnxruntime/core/mlas/lib/sqnbitgemm.cpp | 128 - .../core/mlas/lib/x86_64/jblas/.clang-format | 7 - .../core/mlas/lib/x86_64/jblas/CMakeLists.txt | 33 - .../mlas/lib/x86_64/jblas/jblas/jit_base.h | 303 -- .../mlas/lib/x86_64/jblas/jblas/jit_blas.h | 96 - .../lib/x86_64/jblas/jblas/jit_blas_device.h | 277 - .../x86_64/jblas/jblas/jit_blas_epilogue.h | 329 -- .../lib/x86_64/jblas/jblas/jit_blas_gemm.h | 2699 ---------- .../x86_64/jblas/jblas/jit_blas_parallel.h | 678 --- .../x86_64/jblas/jblas/jit_blas_prologue_a.h | 214 - .../x86_64/jblas/jblas/jit_blas_prologue_b.h | 892 ---- .../lib/x86_64/jblas/jblas/jit_blas_storage.h | 665 --- .../lib/x86_64/jblas/jblas/jit_blas_utils.h | 638 --- .../lib/x86_64/jblas/jblas/jit_blas_wrapper.h | 281 - .../mlas/lib/x86_64/jblas/jblas/kernel_avx2.h | 874 --- .../x86_64/jblas/jblas/kernel_avx512_bf16.h | 92 - .../lib/x86_64/jblas/jblas/kernel_avx512f.h | 1966 ------- .../mlas/lib/x86_64/jblas/jblas/kernel_jit.h | 1375 ----- .../x86_64/jblas/jblas/kernel_jit_injector.h | 930 ---- .../mlas/lib/x86_64/jblas/jblas/kernel_ref.h | 1039 ---- .../lib/x86_64/jblas/jblas/kernel_wrapper.h | 702 --- .../mlas/lib/x86_64/jblas/jblas/xbyak/xbyak.h | 3313 ------------ .../x86_64/jblas/jblas/xbyak/xbyak_bin2hex.h | 271 - .../x86_64/jblas/jblas/xbyak/xbyak_mnemonic.h | 4728 ----------------- .../lib/x86_64/jblas/jblas/xbyak/xbyak_util.h | 1160 ---- .../test/contrib_ops/matmul_4bits_test.cc | 49 +- .../test/mlas/bench/bench_sqnbitgemm.cpp | 61 - 41 files changed, 753 insertions(+), 24620 deletions(-) create mode 100644 cmake/external/neural_speed.cmake create mode 100644 onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h create mode 100644 onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc create mode 100644 onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h create mode 100644 onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h delete mode 100644 onnxruntime/core/mlas/lib/jblas_defs.h delete mode 100644 onnxruntime/core/mlas/lib/jblas_gemm.cpp delete mode 100644 onnxruntime/core/mlas/lib/jblas_gemm.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_parallel.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_a.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_prologue_b.h delete mode 100644 
onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_storage.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_utils.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_wrapper.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx2.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512_bf16.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_avx512f.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_jit_injector.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_ref.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/kernel_wrapper.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_bin2hex.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_mnemonic.h delete mode 100644 onnxruntime/core/mlas/lib/x86_64/jblas/jblas/xbyak/xbyak_util.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 712d5d76108aa..7d7304630c00e 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -88,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF) option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) -option(onnxruntime_USE_JBLAS "Build MLAS with JBLAS support" ON) +option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) @@ -910,6 +910,10 @@ function(onnxruntime_set_compile_flags target_name) target_compile_definitions(${target_name} PRIVATE USE_CUTLASS) endif() + if(USE_NEURAL_SPEED) + target_compile_definitions(${target_name} PRIVATE ORT_NEURAL_SPEED) + endif() + set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR ON) if (onnxruntime_USE_CUDA) # Suppress a "conversion_function_not_usable" warning in gsl/span @@ -1194,14 +1198,10 @@ if (onnxruntime_USE_DNNL) add_compile_definitions(DNNL_OPENMP) endif() -set(USE_JBLAS FALSE) -if (onnxruntime_USE_JBLAS AND NOT onnxruntime_MINIMAL_BUILD) - if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64") - add_compile_definitions(MLAS_JBLAS) - set(USE_JBLAS TRUE) - elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64") - add_compile_definitions(MLAS_JBLAS) - set(USE_JBLAS TRUE) +if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD) + include(neural_speed) + if (USE_NEURAL_SPEED) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla) endif() endif() diff --git a/cmake/deps.txt b/cmake/deps.txt index ff07803013071..fda27e5e93797 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -54,4 +54,4 @@ tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.1.0.zip;757f90a795034a89d4f48a79d1f009f7a04c8dee utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156 
extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c -composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299 +composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299 \ No newline at end of file diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake new file mode 100644 index 0000000000000..e66e2acfb209a --- /dev/null +++ b/cmake/external/neural_speed.cmake @@ -0,0 +1,18 @@ +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND onnxruntime_target_platform STREQUAL "x86_64") + set(USE_NEURAL_SPEED TRUE) +elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" AND onnxruntime_target_platform STREQUAL "x64") + set(USE_NEURAL_SPEED TRUE) +endif() + +if(USE_NEURAL_SPEED) + FetchContent_Declare( + neural_speed + URL https://github.com/intel/neural-speed/archive/refs/tags/bestlav0.1.1.zip + URL_HASH SHA1=65b0f7a0d04f72f0d5a8d48af70f0366f2ab3939 + ) + set(BTLA_USE_OPENMP OFF) + FetchContent_MakeAvailable(neural_speed) + if(NOT neural_speed_POPULATED) + FetchContent_Populate(neural_speed) + endif() +endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index b995b27123218..f89d2150a6830 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -57,15 +57,6 @@ endif() set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas) -function(add_jblas) - add_subdirectory(${MLAS_SRC_DIR}/x86_64/jblas jblas) - target_link_libraries(onnxruntime_mlas PRIVATE jblas::jblas) - target_sources(onnxruntime_mlas PRIVATE - ${MLAS_SRC_DIR}/jblas_gemm.cpp - ) - set_target_properties(${target_name} PROPERTIES COMPILE_WARNING_AS_ERROR OFF) -endfunction() - #TODO: set MASM flags properly function(setup_mlas_source_for_windows) @@ -622,10 +613,6 @@ else() target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs}) endif() -if(USE_JBLAS) - add_jblas() -endif() - foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS}) target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR}) onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET}) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index f60faa4d39116..b81a5c79ac0cc 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -60,6 +60,15 @@ if(NOT onnxruntime_DISABLE_CONTRIB_OPS) "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op_executor.cc" ) endif() + set(onnxruntime_cpu_neural_speed_srcs + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_wrapper.h" + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_defs.h" + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.cc" + "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/quantization/neural_speed_gemm.h" + ) + if(NOT USE_NEURAL_SPEED) + list(REMOVE_ITEM onnxruntime_cpu_contrib_ops_srcs ${onnxruntime_cpu_neural_speed_srcs}) + endif() # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs}) list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_contrib_ops_srcs}) @@ -144,6 +153,12 @@ if (HAS_BITWISE_INSTEAD_OF_LOGICAL) target_compile_options(onnxruntime_providers PRIVATE 
"-Wno-bitwise-instead-of-logical") endif() +if(NOT onnxruntime_DISABLE_CONTRIB_OPS) + if(USE_NEURAL_SPEED) + onnxruntime_add_include_to_target(onnxruntime_providers neural_speed::bestla) + endif() +endif() + if (MSVC) target_compile_options(onnxruntime_providers PRIVATE "/bigobj") # if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 406c73c95d444..72948c74d7877 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -9,6 +9,9 @@ #include "core/mlas/inc/mlas_q4.h" #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/common.h" +#ifdef ORT_NEURAL_SPEED +#include "contrib_ops/cpu/quantization/neural_speed_gemm.h" +#endif namespace onnxruntime { namespace contrib { @@ -24,15 +27,17 @@ class MatMulNBits final : public OpKernel { accuracy_level_{info.GetAttr("accuracy_level")} { ORT_ENFORCE(nbits_ == 4, "Only 4b quantization is supported for MatMulNBits op, additional bits support is planned."); - is_asym_ = info.GetInputCount() >= 4; +#ifdef ORT_NEURAL_SPEED const Tensor* tensor_B = nullptr; const Tensor* tensor_scale = nullptr; const Tensor* tensor_zero_point = nullptr; bool B_constant = info.TryGetConstantInput(1, &tensor_B); bool scale_constant = info.TryGetConstantInput(2, &tensor_scale); bool zero_point_constant = info.TryGetConstantInput(3, &tensor_zero_point); + is_asym_ = info.GetInputCount() >= 4; all_constant_ = B_constant && scale_constant; all_constant_ = is_asym_ ? all_constant_ && zero_point_constant : all_constant_; +#endif } Status Compute(OpKernelContext* context) const override; @@ -53,30 +58,34 @@ class MatMulNBits final : public OpKernel { const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_; size_t packed_b_size_{0}; +#ifdef ORT_NEURAL_SPEED bool is_asym_{false}; bool all_constant_{false}; +#endif }; Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) { is_packed = false; +#ifdef ORT_NEURAL_SPEED if (!all_constant_) { return Status::OK(); } - -#if defined(MLAS_JBLAS) - - auto compt_type = static_cast(accuracy_level_); MLAS_THREADPOOL* pool = NULL; + if (nbits_ != 4) { + return Status::OK(); + } + auto comp_type = static_cast(accuracy_level_); + auto nbits = static_cast(nbits_); if (input_idx == 1) { - packed_b_size_ = MlasNBitsGemmPackBSize(N_, K_, block_size_, static_cast(nbits_), is_asym_, compt_type); + packed_b_size_ = NSNBitsGemmPackBSize(N_, K_, block_size_, nbits, is_asym_, comp_type); if (packed_b_size_ == 0) return Status::OK(); auto qptr = tensor.Data(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); std::memset(packed_b_.get(), 0, packed_b_size_); - MlasNBitsGemmPackB(packed_b_.get(), qptr, nullptr, nullptr, N_, K_, K_, block_size_, static_cast(nbits_), - is_asym_, false, compt_type, pool); + NSNBitsGemmPackB(packed_b_.get(), qptr, nullptr, nullptr, N_, K_, K_, block_size_, nbits, is_asym_, false, + comp_type, pool); if (prepacked_weights) { prepacked_weights->buffers_.push_back(std::move(packed_b_)); prepacked_weights->buffer_sizes_.push_back(packed_b_size_); @@ -85,8 +94,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat } if (input_idx == 2 && packed_b_ != nullptr) { auto sptr = tensor.Data(); - MlasNBitsGemmPackB(packed_b_.get(), nullptr, sptr, nullptr, N_, K_, 
K_, block_size_, static_cast(nbits_), - is_asym_, !is_asym_, compt_type, pool); + NSNBitsGemmPackB(packed_b_.get(), nullptr, sptr, nullptr, N_, K_, K_, block_size_, nbits, is_asym_, !is_asym_, + comp_type, pool); if (prepacked_weights) { prepacked_weights->buffers_.push_back(std::move(packed_b_)); prepacked_weights->buffer_sizes_.push_back(packed_b_size_); @@ -95,8 +104,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat } if (input_idx == 3 && packed_b_ != nullptr) { auto zptr = tensor.Data(); - MlasNBitsGemmPackB(packed_b_.get(), nullptr, nullptr, zptr, N_, K_, K_, block_size_, static_cast(nbits_), - is_asym_, is_asym_, compt_type, pool); + NSNBitsGemmPackB(packed_b_.get(), nullptr, nullptr, zptr, N_, K_, K_, block_size_, nbits, is_asym_, is_asym_, + comp_type, pool); if (prepacked_weights) { prepacked_weights->buffers_.push_back(std::move(packed_b_)); prepacked_weights->buffer_sizes_.push_back(packed_b_size_); @@ -104,7 +113,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat is_packed = true; } -#else // defined(MLAS_JBLAS) +#else // defined(ORT_NEURAL_SPEED) if (input_idx == 1) { packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_); @@ -119,7 +128,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat is_packed = true; } -#endif // defined(MLAS_JBLAS) +#endif // defined(ORT_NEURAL_SPEED) return Status::OK(); } @@ -127,9 +136,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ Allocat Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& prepacked_buffers, int input_idx, /*out*/ bool& used_shared_buffers) { used_shared_buffers = false; - -#if defined(MLAS_JBLAS) - +#ifdef ORT_NEURAL_SPEED // Pack three tensors into one buffer if (input_idx == 1) { used_shared_buffers = true; @@ -144,14 +151,14 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& prep packed_b_ = std::move(prepacked_buffers[0]); } -#else // defined(MLAS_JBLAS) +#else // defined(ORT_NEURAL_SPEED) if (input_idx == 1) { used_shared_buffers = true; packed_b_ = std::move(prepacked_buffers[0]); } -#endif // defined(MLAS_JBLAS) +#endif // defined(ORT_NEURAL_SPEED) return Status::OK(); } @@ -160,9 +167,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { const Tensor* a = ctx->Input(0); const auto* a_data = a->Data(); - -#if defined(MLAS_JBLAS) - +#ifdef ORT_NEURAL_SPEED if (packed_b_.get()) { TensorShape b_shape({static_cast(N_), static_cast(K_)}); @@ -181,7 +186,7 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { const size_t N = static_cast(helper.N()); const size_t K = static_cast(helper.K()); const size_t lda = helper.Lda(false); - std::vector gemm_params(max_len); + std::vector gemm_params(max_len); AllocatorPtr allocator; auto status = ctx->GetTempSpaceAllocator(&allocator); ORT_RETURN_IF_ERROR(status); @@ -192,15 +197,14 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { gemm_params[i].C = y_data + helper.OutputOffsets()[i]; gemm_params[i].ldc = N; } - auto ws_size = MlasSQNBitsGemmBatchPackedBWorkspaceSize(M, N, K, max_len, gemm_params.data()); + auto ws_size = NSSQNBitsGemmBatchWorkspaceSize(M, N, K, max_len, gemm_params.data()); // workspace for activation process(dynamic quantization and others) auto ws_ptr = IAllocator::MakeUniquePtr(allocator, ws_size); - MlasSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), ws_ptr.get(), - thread_pool); + NSSQNBitsGemmBatchPackedB(M, N, K, max_len, gemm_params.data(), 
ws_ptr.get(), thread_pool); return Status::OK(); } -#endif // defined(MLAS_JBLAS) +#endif // defined(ORT_NEURAL_SPEED) const Tensor* scales = ctx->Input(2); const Tensor* zero_points = ctx->Input(3); diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h new file mode 100644 index 0000000000000..864abffd131fe --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_defs.h @@ -0,0 +1,45 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +--*/ + +#pragma once + +#include "contrib_ops/cpu/quantization/neural_speed_wrapper.h" + +namespace bestla { + +using tAVX512F = gemm::SCoreRowNAvx512f<48, 8>; +using tAMX_BF16 = gemm::HCoreRowNAmxbf16<64, 16>; +using tAVX512_FP16 = gemm::HCoreRowNAvx512fp16<96, 8>; +using tAVX_VNNI = gemm::ICoreRowNAvxvnni<24, 4>; +using tAVX512_VNNI = gemm::ICoreRowNAvx512vnni<48, 8>; +using tAMX_INT8_US = gemm::ICoreRowNAmxint8<64, 16>; +using tAMX_INT8_SS = gemm::ICoreRowNAmxint8SS<64, 16>; +using tAVX2 = gemm::SCoreRowNAvx2<24, 4>; +using tAVX_VNNI_KBlock = gemm::ICoreRowNAvxvnniKBlock<24, 2>; +using tAVX512_VNNI_KBlock = gemm::ICoreRowNAvx512vnniKBlock<48, 4>; +using tAMX_INT8_US_KBlock = gemm::ICoreRowNAmxint8KBlock<48, 16>; +using tAMX_INT8_SS_KBlock = gemm::ICoreRowNAmxint8SSKBlock<48, 16>; + +template +using tWeiNInt = prologue_b::gemm::WeightKBlockNInteger; +template +using tWeiNFloat = prologue_b::gemm::WeightKBlockNFloat; + +class ORTThreading : public parallel::IThreading { + public: + explicit ORTThreading(void* tp); + void parallel_for(const parallel::thread_func& func) const override; + void set_threads(int nthreads) override { + (void)(nthreads); + assert(0); + } + void sync() const override { assert(0); } + void* mTp; +}; + +} // namespace bestla diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc new file mode 100644 index 0000000000000..73aaa4ae61a6e --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.cc @@ -0,0 +1,438 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + neural_speed_gemm.cpp + +Abstract: + + GEMM template combinations of neural_speed. 
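+
+    Implements the ORTThreading adapter declared in neural_speed_defs.h (bridging
+    the onnxruntime thread pool to bestla's parallel::IThreading) and dispatches
+    the 4-bit GEMM kernels on the detected CPU ISA: AVX2/AVX-512F cores for fp32
+    compute, AVX-VNNI/AVX512-VNNI/AMX-INT8 KBlock cores for int8 compute.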
+--*/ + +#include "contrib_ops/cpu/quantization/neural_speed_defs.h" +#include "contrib_ops/cpu/quantization/neural_speed_gemm.h" +#include "core/platform/threadpool.h" + +using ThreadPool = onnxruntime::concurrency::ThreadPool; + +namespace bestla { + +ORTThreading::ORTThreading(void* tp) + : IThreading(ThreadPool::DegreeOfParallelism(reinterpret_cast(tp))), mTp(tp) {} + +void ORTThreading::parallel_for(const parallel::thread_func& func) const { + ThreadPool::TrySimpleParallelFor(reinterpret_cast(mTp), mThreadNum, + [&](ptrdiff_t tid) { func(static_cast(tid)); }); +} + +template +static void NSSQ4GemmCompF32(size_t M, size_t N, size_t K, const float* A, size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc, int8_t* WorkSpace, + parallel::IThreading* th) { + auto M_ = static_cast(M); + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto lda_ = static_cast(lda); + auto ldc_ = static_cast(ldc); + utils::GemmProblem gp(1, M_, N_, K_, B->mBlockSize); + if (M <= 16) { + using Parallel = parallel::gemm::SchedulerKBlock; + using Launcher = + wrapper::gemm::LauncherKBlock; + static Launcher kernel; + auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize); + if (B->IsAsym()) { + reduceA.assign(WorkSpace); + ORTThreading single(nullptr); + kernel.mProA.reduce({A, lda_, &reduceA}, M_, K_, B->mBlockSize, &single); + } + typename Launcher::Param args{gp, + {A, lda_, &reduceA}, + {B}, + {B->template SPtr(), B->SDtype(), B->CStep(), B->template ZPtr(), + reduceA.template RPtr(), reduceA.lda}, + {C, ldc_, nullptr}}; + parallel::GemmRun(kernel, args, th); + } else { + using Parallel = parallel::gemm::SchedulerBase; + using Launcher = + wrapper::gemm::LauncherBase; + static Launcher kernel; + typename Launcher::Param args{gp, {A, lda_}, {B}, {C, ldc_, nullptr}}; + parallel::GemmRun(kernel, args, th); + } +} + +template +static void NSSQ4GemmCompInt8(size_t M, size_t N, size_t K, const float* A, size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc, int8_t* WorkSpace, + parallel::IThreading* th) { + using Parallel = parallel::gemm::SchedulerKBlockS; + using Launcher = + wrapper::gemm::LauncherIntKBlock; + auto M_ = static_cast(M); + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto lda_ = static_cast(lda); + auto ldc_ = static_cast(ldc); + static Launcher kernel; + auto quanA = kernel.mProA.createStorage(M_, K_, B->mBlockSize, B->IsAsym()); + quanA.assign(WorkSpace); + if (M <= 16) { + ORTThreading single(nullptr); + kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, &single); + } else { + kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, th); + } + utils::GemmProblem gp(1, M_, N_, K_, B->mBlockSize); + typename Launcher::Param args{gp, {A, lda_, &quanA}, {B}, {C, ldc_, nullptr}}; + parallel::GemmRun(kernel, args, th); +} + +template +static size_t NSSQ4GemmCompF32WorkspaceSize(size_t M, size_t N, size_t K, const float* A, size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc) { + auto M_ = static_cast(M); + auto K_ = static_cast(K); + (void)(A); + (void)(N); + (void)(C); + (void)(lda); + (void)(ldc); + if (M <= 16) { + using ProA = prologue_a::gemm::ActivationKBlockBaseF32; + static ProA proA; + if (B->IsAsym()) { + auto reduceA = proA.createStorage(M_, K_, B->mBlockSize); + return reduceA.mSize; + } + return 0; + } else { + // using ProA = prologue_a::gemm::ActivationBase; + return 0; + } +} + +template +static size_t NSSQ4GemmCompInt8WorkspaceSize(size_t M, size_t N, size_t K, const float* A, 
size_t lda, + storage::gemm::StorageWeightKBlockNInteger* B, float* C, size_t ldc) { + (void)(N); + (void)(lda); + (void)(ldc); + (void)(A); + (void)(C); + using ProA = prologue_a::gemm::ActivationF32KBlockQuantize; + static ProA proA; + auto quanA = + proA.createStorage(static_cast(M), static_cast(K), static_cast(B->mBlockSize), B->IsAsym()); + return quanA.mSize; +} + +} // namespace bestla + +using namespace bestla; + +static bool NSSQ4GemmBatchDriver(size_t M, size_t N, size_t K, size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, int8_t* WorkSpace, + void* ThreadPool) { + GetCPUDevice(); + bestla::ORTThreading orth(ThreadPool); + bool processed = true; + for (size_t i = 0; i < BatchN; i++) { + auto ptr = bestla::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); + auto uptr = std::unique_ptr(ptr); + if (ptr) { + auto NTile = gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT); + auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId); + auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId); + auto btype = static_cast(gemm::CompTypeHelper::get_B(CType)); + if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) { + auto kptr = reinterpret_cast(ptr); + auto BlkSize = kptr->mBlockSize; + if (btype == gemm::CompType::tFP32 && PackRow == 1) { + if (NTile == bestla::tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + bestla::NSSQ4GemmCompF32(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, &orth); + } else if (NTile == bestla::tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + bestla::NSSQ4GemmCompF32(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, + DataParams[i].ldc, WorkSpace, &orth); + } + } + if (btype == gemm::CompType::tS8 && PackRow == 4) { + if (NTile == bestla::tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && + BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + bestla::NSSQ4GemmCompInt8(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, + &orth); + } else if (NTile == bestla::tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() && + BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + bestla::NSSQ4GemmCompInt8(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, + &orth); + } else if (NTile == bestla::tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && + BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + bestla::NSSQ4GemmCompInt8(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc, WorkSpace, &orth); + } + } + } + } else { + processed = false; + break; + } + } + return processed; +} + +static size_t NSSQ4GemmBatchWorkspaceSize(size_t M, size_t N, size_t K, size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams) { + GetCPUDevice(); + size_t size = 0; + for (size_t i = 0; i < BatchN; i++) { + auto ptr = storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); + auto uptr = std::unique_ptr(ptr); + if (ptr) { + if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) { + auto kptr = reinterpret_cast(ptr); + auto NTile = + gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT); + auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId); + auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId); + auto btype = static_cast(gemm::CompTypeHelper::get_B(CType)); + auto BlkSize = kptr->mBlockSize; + if (btype == 
gemm::CompType::tFP32 && PackRow == 1) { + if (NTile == tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + size = std::max(NSSQ4GemmCompF32WorkspaceSize(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc), + size); + } else if (NTile == tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + size = std::max(NSSQ4GemmCompF32WorkspaceSize(M, N, K, DataParams[i].A, DataParams[i].lda, kptr, + DataParams[i].C, DataParams[i].ldc), + size); + } + } + if (btype == gemm::CompType::tS8 && PackRow == 4) { + if (NTile == tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + size = std::max(NSSQ4GemmCompInt8WorkspaceSize( + M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc), + size); + } else if (NTile == tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() && + BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + size = std::max(NSSQ4GemmCompInt8WorkspaceSize( + M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc), + size); + } else if (NTile == tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + size = std::max(NSSQ4GemmCompInt8WorkspaceSize( + M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc), + size); + } + } + } + } + } + return size; +} + +template +static size_t NSQ4BuSize(size_t block_size, size_t N, size_t K, bool isAsym) { + static T proB; + auto stor = proB.createStorage(static_cast(N), static_cast(K), static_cast(block_size), + BTLA_DTYPE::S4_CLIP, BTLA_DTYPE::F32, BTLA_DTYPE::BF16, isAsym); + // TODO(Yu) support more scale dtype + return stor.mSize; +} + +static bool NSQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool) { + auto ptr = storage::gemm::PackedWeightParser::deserialBuffer(PackedBuf); + auto uptr = std::unique_ptr(ptr); + ORTThreading orth(ThreadPool); + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto ldb_ = static_cast(ldb); + GetCPUDevice(); + if (ptr) { + auto NTile = gemm::CoreAttr::get_mask_val(ptr->mCoreId, gemm::CoreAttr::NTILE_MASK, gemm::CoreAttr::NTILE_SHIFT); + auto PackRow = gemm::CoreAttr::get_packrow(ptr->mCoreId); + auto CType = gemm::CoreAttr::get_comp(ptr->mCoreId); + auto btype = static_cast(gemm::CompTypeHelper::get_B(CType)); + if (ptr->mPrologueID == BTLA_PROLOGUEB_IDS::WeightKBlockNInteger) { + auto wptr = reinterpret_cast(ptr); + auto BlkSize = wptr->mBlockSize; + if (btype == gemm::CompType::tFP32 && PackRow == 1) { + if (NTile == tAVX512F::NTILE && _cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } else if (NTile == tAVX2::NTILE && _cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } + } + if (btype == gemm::CompType::tS8 && PackRow == 4) { + if (NTile == tAMX_INT8_SS_KBlock::NTILE && _cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } else if (NTile == tAVX512_VNNI_KBlock::NTILE && _cd->AVX512_VNNI() && + BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, FpData, ldb_, &orth); + } else if (NTile == tAVX_VNNI_KBlock::NTILE && _cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + static tWeiNInt proB; + proB.unpackWeight(N_, K_, wptr, 
FpData, ldb_, &orth); + } + } + } + return true; + } + return false; +} + +template +static void NSQ4GemmPackBImpl(void* PackedBuf, size_t BlkSize, const uint8_t* QData, const float* Scale, + const uint8_t* Zp, size_t N, size_t K, bool IsAsym, bool lastCall, size_t ldb, + void* ThreadPool) { + static T proB; + auto N_ = static_cast(N); + auto K_ = static_cast(K); + auto stor = proB.createStorage(N_, K_, static_cast(BlkSize), BTLA_DTYPE::S4_CLIP, BTLA_DTYPE::F32, + BTLA_DTYPE::BF16, IsAsym); + stor.assign(reinterpret_cast(PackedBuf)); + ORTThreading orth(ThreadPool); + proB.packNbitsWeightQ4(N_, K_, IsAsym, QData, static_cast(ldb), Scale, Zp, &stor, &orth); + if (lastCall) { + proB.reduceWeight(&stor, &orth); + } +} + +static size_t NSQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, NS_SQNBIT_COMPUTE_TYPE CompType) { + GetCPUDevice(); + if (K % BlkSize != 0) { + return 0; + } + // from low precision to high precision + switch (CompType) { + case NSCompInt8: + if (!isAsym) { // asym int8 is not optimized, so fall through to others. + if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + } + [[fallthrough]]; + case NSCompBf16: + case NSCompFp16: + case NSCompFp32: + case NSCompUndef: + if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + return NSQ4BuSize>(BlkSize, N, K, isAsym); + } + [[fallthrough]]; + default: + return 0; + } +} + +static bool NSQ4GemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, + size_t K, size_t ldb, size_t BlkSize, bool isAsym, bool lastCall, + NS_SQNBIT_COMPUTE_TYPE CompType, void* ThreadPool) { + GetCPUDevice(); + // explicit statement fall through. + switch (CompType) { + case NSCompInt8: + if (!isAsym) { // asym int8 is not optimized, so fall through to others. 
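+        // Pack for the widest symmetric int8 kernel the CPU supports (AMX-INT8,
+        // then AVX512-VNNI, then AVX-VNNI) whose KTILE divides BlkSize; if none
+        // matches, or the weight is asymmetric, fall through and pack for the
+        // fp32 cores (AVX-512F, AVX2) handled by the cases below.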
+ if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS_KBlock::KTILE == 0) { + NSQ4GemmPackBImpl>( + PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool); + return true; + } + if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI_KBlock::KTILE == 0) { + NSQ4GemmPackBImpl>( + PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool); + return true; + } + if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI_KBlock::KTILE == 0) { + NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, + K, isAsym, lastCall, ldb, ThreadPool); + return true; + } + } + [[fallthrough]]; + case NSCompBf16: + case NSCompFp16: + case NSCompFp32: + case NSCompUndef: + if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { + NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, + lastCall, ldb, ThreadPool); + return true; + } + if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { + NSQ4GemmPackBImpl>(PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, + ldb, ThreadPool); + return true; + } + [[fallthrough]]; + default: + return false; + } +} + +size_t NSNBitsGemmPackBSize(size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, + NS_SQNBIT_COMPUTE_TYPE CompType) { + if (nbits == 4) { + auto jsize = NSQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType); + if (jsize) { + return jsize; + } + } + return 0; +} + +void NSNBitsGemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, size_t K, + size_t ldb, size_t BlkSize, int nbits, bool isAsym, bool lastCall, + NS_SQNBIT_COMPUTE_TYPE CompType, void* ThreadPool) { + if (nbits == 4) { + if (NSQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) { + return; + } + } +} + +void NSNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* ThreadPool) { + // only nbits=4 can be packed, so not necessary to check the nbits in DataParams + if (NSQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) { + return; + } +} + +size_t NSSQNBitsGemmBatchWorkspaceSize(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams) { + // only nbits=4 can be packed, so not necessary to check the nbits in DataParams + return NSSQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams); +} + +void NSSQNBitsGemmBatchPackedB(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, void* WorkSpace, + void* ThreadPool) { + // only nbits=4 can be packed, so not necessary to check the nbits in DataParams + if (NSSQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast(WorkSpace), ThreadPool)) { + // PackedWeight is created by bestla + return; + } +} diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h new file mode 100644 index 0000000000000..ebcb3027a209f --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_gemm.h @@ -0,0 +1,129 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + neural_speed_gemm.h + +Abstract: + + Prepack-weight GEMM APIs of neural_speed. 
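+
+    Typical use (illustrative sketch only; packed_b, q_data, scales, zero_points,
+    workspace and the thread-pool handle tp are placeholders, the NSCompInt8
+    choice is just an example, and error handling is omitted):
+
+      size_t size = NSNBitsGemmPackBSize(N, K, block_size, 4, is_asym, NSCompInt8);
+      // size == 0 means the parameter combination is not supported.
+      NSNBitsGemmPackB(packed_b, q_data, nullptr, nullptr, N, K, K, block_size, 4,
+                       is_asym, /*last_call=*/false, NSCompInt8, tp);
+      NSNBitsGemmPackB(packed_b, nullptr, scales, nullptr, N, K, K, block_size, 4,
+                       is_asym, /*last_call=*/!is_asym, NSCompInt8, tp);
+      if (is_asym)
+        NSNBitsGemmPackB(packed_b, nullptr, nullptr, zero_points, N, K, K, block_size,
+                         4, is_asym, /*last_call=*/true, NSCompInt8, tp);
+
+      NS_SQNBITS_GEMM_DATA_PACKED_PARAMS params{A, packed_b, C, lda, ldc};
+      size_t ws = NSSQNBitsGemmBatchWorkspaceSize(M, N, K, 1, &params);
+      // workspace must be at least ws bytes.
+      NSSQNBitsGemmBatchPackedB(M, N, K, 1, &params, workspace, tp);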
+--*/ + +#pragma once + +#include +#include + +/** + * @brief Define compute types of block quantization + */ +enum NS_SQNBIT_COMPUTE_TYPE { + NSCompUndef = 0, /*!< undef */ + NSCompFp32 = 1, /*!< input fp32, accumulator fp32 */ + NSCompFp16 = 2, /*!< input fp16, accumulator fp16 */ + NSCompBf16 = 3, /*!< input bf16, accumulator fp32 */ + NSCompInt8 = 4 /*!< input int8, accumulator int32 */ +}; + +/** + * @brief Data parameters for NBits GEMM routine + * C = A * B + * A, C must be a float32 matrix + * B must be a packed nbits blob + * All except C are [in] parameters + */ +struct NS_SQNBITS_GEMM_DATA_PACKED_PARAMS { + const float* A = nullptr; /**< address of A (float32 matrix)*/ + const void* B = nullptr; /**< address of B (packed nbits blob)*/ + float* C = nullptr; /**< address of result matrix */ + size_t lda = 0; /**< leading dimension of A */ + size_t ldc = 0; /**< leading dimension of C*/ +}; + +/** + * @brief Compute the byte size of the parameter combination + * + * @param N the number of columns of matrix B. + * @param K the number of rows of matrix B. + * @param block_size size of the block to quantize, elements from the same block share the same + * scale and zero point + * @param nbits number of bits used for weight quantization + * @param is_asym flag for asymmetric quantization + * @param comp_type specify input data type and accumulator data type + * @return size of the packing buffer, 0 if the operation is not yet supported. + */ +size_t NSNBitsGemmPackBSize(size_t N, size_t K, size_t block_size, int nbits, bool is_asym, + NS_SQNBIT_COMPUTE_TYPE comp_type); + +/** + * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers. + * + * @param PackedBuf packed data buffer + * @param QData quantized data buffer + * @param Scale scale pointer + * @param Zp zero point pointer + * @param N the number of columns of matrix B. + * @param K the number of rows of matrix B. + * @param ldb leading dimension of B + * @param block_size size of the block to quantize, elements from the same block share the same + * scale and zero point + * @param nbits number of bits used for weight quantization (default 4) + * @param is_asym flag for asymmetric quantization + * @param comp_type specify input data type and accumulator data type + * @param last_call flag to activate the epilogue process of packB. OpKernel::PrePack will query input tensor + * one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where + * they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up + * inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale + * (is_asym is false) and Zp(is_asym is true). + * @param thread_pool + */ +void NSNBitsGemmPackB(void* PackedBuf, const uint8_t* QData, const float* Scale, const uint8_t* Zp, size_t N, size_t K, + size_t ldb, size_t block_size, int nbits, bool is_asym, bool last_call, + NS_SQNBIT_COMPUTE_TYPE comp_type, void* thread_pool); + +/** + * @brief Unpack and dequantize to fp32 + * + * @param FpData unpacked float32 data + * @param PackedBuf quantized and packed data + * @param N the number of columns of matrix B. + * @param K the number of rows of matrix B. 
+ * @param ldb leading dimension of B + * @param thread_pool + */ +void NSNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, void* thread_pool); + +/** + * @brief Get the workspace size required by computation. + * + * @param[in] M row size of matrix A and C + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BatchN number of batches + * @param[inout] DataParams An array (size BatchN) of parameter blocks + * @return Workspace size in bytes + */ +size_t NSSQNBitsGemmBatchWorkspaceSize(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams); + +/** + * @brief Batched GEMM: C = A * B + * A, C must be a float32 matrix + * B must be a packed nbits blob + * + * @param[in] M row size of matrix A and C + * @param[in] N column size of matrix B and C + * @param[in] K column size of matrix A and row size of matrix B + * @param[in] BatchN number of batches + * @param[inout] DataParams An array (size BatchN) of parameter blocks + * @param[in] WorkSpace temporary buffer + * @param[in] ThreadPool + * @return + */ +void NSSQNBitsGemmBatchPackedB(const size_t M, const size_t N, const size_t K, const size_t BatchN, + const NS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, void* WorkSpace, + void* ThreadPool = nullptr); diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h new file mode 100644 index 0000000000000..d3902f9bd68c7 --- /dev/null +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h @@ -0,0 +1,39 @@ +//----------------------------------------------------------------------------- +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+// +//----------------------------------------------------------------------------- +#pragma once +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wsign-compare" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-value" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#pragma GCC diagnostic ignored "-Wunused-function" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-but-set-parameter" + +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4457) +#pragma warning(disable : 4189) +#pragma warning(disable : 4100) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#pragma warning(disable : 4702) +#endif + +#include "bestla/bestla_prologue_a.h" +#include "bestla/bestla_wrapper.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/onnxruntime/core/mlas/inc/mlas_qnbit.h b/onnxruntime/core/mlas/inc/mlas_qnbit.h index bc0bfc92c85a0..047011e70bd4d 100644 --- a/onnxruntime/core/mlas/inc/mlas_qnbit.h +++ b/onnxruntime/core/mlas/inc/mlas_qnbit.h @@ -183,133 +183,3 @@ MlasSQNBitGemmPackQuantBData( void* PackedQuantBData, MLAS_THREADPOOL* ThreadPool = nullptr ); - -/** - * @brief Data parameters for NBits GEMM routine - * C = A * B - * A, C must be a float32 matrix - * B must be a packed nbits blob - * All except C are [in] parameters - */ -struct MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS { - const float* A = nullptr; /**< address of A (float32 matrix)*/ - const void* B = nullptr; /**< address of B (packed nbits blob)*/ - float* C = nullptr; /**< address of result matrix */ - size_t lda = 0; /**< leading dimension of A */ - size_t ldc = 0; /**< leading dimension of C*/ -}; - -/** - * @brief Compute the byte size of the parameter combination - * - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param block_size size of the block to quantize, elements from the same block share the same - * scale and zero point - * @param nbits number of bits used for weight quantization - * @param is_asym flag for asymmetric quantization - * @param comp_type specify input data type and accumulator data type - * @return size of the packing buffer, 0 if the operation is not yet supported. - */ -size_t MLASCALL -MlasNBitsGemmPackBSize( - size_t N, size_t K, size_t block_size, int nbits, bool is_asym, MLAS_SQNBIT_COMPUTE_TYPE comp_type -); - -/** - * @brief Prepack tensor data from n-bit quantized data, scale and zero point buffers. - * - * @param PackedBuf packed data buffer - * @param QData quantized data buffer - * @param Scale scale pointer - * @param Zp zero point pointer - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param ldb leading dimension of B - * @param block_size size of the block to quantize, elements from the same block share the same - * scale and zero point - * @param nbits number of bits used for weight quantization (default 4) - * @param is_asym flag for asymmetric quantization - * @param comp_type specify input data type and accumulator data type - * @param last_call flag to activate the epilogue process of packB. 
OpKernel::PrePack will query input tensor - * one by one: QData, Scale, Zp (if is_asym is true). But kernel prefers to pack all tensors into one blob data where - * they can share the common attributes like: block_size. Meanwhile, kernel has some pre-computations to speed up - * inference which require that all blob data are ready. So, you need to set this flag to true when passing Scale - * (is_asym is false) and Zp(is_asym is true). - * @param thread_pool - */ -void MLASCALL -MlasNBitsGemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t block_size, - int nbits, - bool is_asym, - bool last_call, - MLAS_SQNBIT_COMPUTE_TYPE comp_type, - MLAS_THREADPOOL* thread_pool -); - -/** - * @brief Unpack and dequantize to fp32 - * - * @param FpData unpacked float32 data - * @param PackedBuf quantized and packed data - * @param N the number of columns of matrix B. - * @param K the number of rows of matrix B. - * @param ldb leading dimension of B - * @param thread_pool - */ -void MLASCALL -MlasNBitsGemmUnPackB( - float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* thread_pool -); - -/** - * @brief Get the workspace size required by computation. - * - * @param[in] M row size of matrix A and C - * @param[in] N column size of matrix B and C - * @param[in] K column size of matrix A and row size of matrix B - * @param[in] BatchN number of batches - * @param[inout] DataParams An array (size BatchN) of parameter blocks - * @return Workspace size in bytes - */ -size_t MLASCALL -MlasSQNBitsGemmBatchPackedBWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -); - -/** - * @brief Batched GEMM: C = A * B - * A, C must be a float32 matrix - * B must be a packed nbits blob - * - * @param[in] M row size of matrix A and C - * @param[in] N column size of matrix B and C - * @param[in] K column size of matrix A and row size of matrix B - * @param[in] BatchN number of batches - * @param[inout] DataParams An array (size BatchN) of parameter blocks - * @param[in] WorkSpace temporary buffer - * @param[in] ThreadPool - * @return - */ -void MLASCALL -MlasSQNBitsGemmBatchPackedB( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - void* WorkSpace, - MLAS_THREADPOOL* ThreadPool = nullptr -); diff --git a/onnxruntime/core/mlas/lib/jblas_defs.h b/onnxruntime/core/mlas/lib/jblas_defs.h deleted file mode 100644 index 9cd1711a3ffd2..0000000000000 --- a/onnxruntime/core/mlas/lib/jblas_defs.h +++ /dev/null @@ -1,73 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. - ---*/ - -#pragma once - -#include "jblas/jit_blas_prologue_b.h" -#include "jblas/jit_blas_wrapper.h" - -namespace jblas -{ - -/* -Name conversion explaination: -Fp32: comp type, determined by GemmCore, can be any jblas::gemm::SCorexxx(float GemmCore) -S4: weight dtype, determined by jblas::prologue_b::gemm::WeightKBlockS4(also support other integer and float weight -classes) -F32F32: input/output dtype, determined by jblas::prologue_a::gemm::ActivationKBlockBaseF32 and -jblas::epilogue::gemm::AccumulatorWriteBackFp32. - -Tips: jblas::epilogue::gemm::CompFp32BlockEpilogue is a fixed class for all fp32 accumulator GemmCores. 
-*/ -template -using tLauncher_Fp32_S4_F32F32 = jblas::wrapper::gemm::LauncherKBlock< - GemmCore_T::ISA, - GemmCore_T, - jblas::prologue_a::gemm::ActivationKBlockBaseF32, - jblas::prologue_b::gemm::WeightKBlockS4, - jblas::epilogue::gemm::CompFp32BlockEpilogue, - jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - -/* -Name conversion explaination: -Int8: comp type, determined by GemmCore, can be any jblas::gemm::ICorexxx(integer GemmCore) -S4: weight dtype, determined by jblas::prologue_b::gemm::WeightKBlockS4(support integer weight classes only) -F32F32: input/output dtype, determined by jblas::prologue_a::gemm::ActivationKBlockBaseF32 and -jblas::epilogue::gemm::AccumulatorWriteBackFp32. - -Tips: jblas::epilogue::gemm::CompInt8BlockEpilogue is a fixed class for all int32 accumulator GemmCores. -*/ -template -using tLauncher_Int8_S4_F32F32 = jblas::wrapper::gemm::LauncherKBlock< - GemmCore_T::ISA, - GemmCore_T, - jblas::prologue_a::gemm::ActivationF32KBlockQuantize, - jblas::prologue_b::gemm::WeightKBlockS4, - jblas::epilogue::gemm::CompInt8BlockEpilogue, - jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - -using tAVX512F = jblas::gemm::SCoreRowNAvx512f<48, 8>; -using tAMX_BF16 = jblas::gemm::HCoreRowNAmxbf16<64, 16>; -using tAVX512_FP16 = jblas::gemm::HCoreRowNAvx512fp16<96, 8>; -using tAVX_VNNI = jblas::gemm::ICoreRowNAvxvnni<48, 2>; // TODO(Yu) use 24x4 for higher efficiency -using tAVX512_VNNI = jblas::gemm::ICoreRowNAvx512vnni<48, 8>; -using tAMX_INT8_US = jblas::gemm::ICoreRowNAmxint8<64, 16>; -using tAMX_INT8_SS = jblas::gemm::ICoreRowNAmxint8SS<64, 16>; -using tAVX2 = jblas::gemm::SCoreRowNAvx2<48, 2>; // TODO(Yu) use 24x4 for higher efficiency - -class ORTThreading : public jblas::parallel::IThreading -{ - public: - ORTThreading(void* tp); - void parallel_for(const jblas::parallel::thread_func& func) override; - void set_threads(int nthreads) override { assert(0); } - void sync() override { assert(0); } - void* mTp; -}; - -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/jblas_gemm.cpp b/onnxruntime/core/mlas/lib/jblas_gemm.cpp deleted file mode 100644 index f3cae3186c28e..0000000000000 --- a/onnxruntime/core/mlas/lib/jblas_gemm.cpp +++ /dev/null @@ -1,534 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. - -Module Name: - - jblas_gemm.cpp - -Abstract: - - Currently only support Q4 gemm. 
---*/ - -#include "jblas_gemm.h" - -#include "jblas_defs.h" -#include "mlasi.h" - -using namespace jblas; - -jblas::ORTThreading::ORTThreading(void* tp) - : IThreading(MLAS_THREADPOOL::DegreeOfParallelism(reinterpret_cast(tp))), mTp(tp) -{ -} - -void -jblas::ORTThreading::parallel_for(const jblas::parallel::thread_func& func) -{ - MlasTrySimpleParallel(reinterpret_cast(mTp), mThreadNum, [&](ptrdiff_t tid) { - func(static_cast(tid)); - }); -} - -template -static void -JblasSQ4GemmCompF32( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc, - int8_t* WorkSpace, - jblas::parallel::IThreading* th -) -{ - auto M_ = static_cast(M); - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto lda_ = static_cast(lda); - auto ldc_ = static_cast(ldc); - if (M <= 16) { - using Parallel = jblas::parallel::gemm::SchedulerKBlock; - using Launcher = tLauncher_Fp32_S4_F32F32; - static Launcher kernel; - auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize); - if (B->mIsAsym) { - reduceA.assign(WorkSpace); - ORTThreading single(nullptr); - kernel.mProA.reduce({A, lda_}, &reduceA, M_, K_, &single); - } - typename Launcher::BEpiParam blkargs{ - B->template SPtr(), B->mScaT, B->mCStep, B->template ZPtr(), - reduceA.template get(), reduceA.lda}; - - typename Launcher::Param args{M_, N_, K_, B->mBlockSize, {A, lda_}, {B}, blkargs, {C, ldc_}}; - jblas::parallel::GemmKBlockRun(kernel, args, th); - } else { - using Parallel = jblas::parallel::gemm::SchedulerBase; - using Launcher = jblas::wrapper::gemm::LauncherBase< - GemmCore_T::ISA, GemmCore_T, jblas::prologue_a::gemm::ActivationBase, - jblas::prologue_b::gemm::WeightKBlockS4, jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - static Launcher kernel; - - typename Launcher::Param args{M_, N_, K_, {A, lda_}, {B}, {C, ldc_}}; - jblas::parallel::GemmBaseRun(kernel, args, th); - } -} - -template -static void -JblasSQ4GemmCompInt8( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc, - int8_t* WorkSpace, - jblas::parallel::IThreading* th -) -{ - using Parallel = jblas::parallel::gemm::SchedulerKBlock; - using Launcher = tLauncher_Int8_S4_F32F32; - auto M_ = static_cast(M); - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto lda_ = static_cast(lda); - auto ldc_ = static_cast(ldc); - static Launcher kernel; - auto quanA = kernel.mProA.createStorage(M_, K_, B->mBlockSize, B->mIsAsym); - quanA.assign(WorkSpace); - if (M <= 16) { - ORTThreading single(nullptr); - kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, &single); - } else { - kernel.mProA.quantize({A, lda_, &quanA}, M_, K_, th); - } - typename Launcher::Param args{ - M_, - N_, - K_, - B->mBlockSize, - {A, lda_, &quanA}, - {B}, - {B->template SPtr(), B->mScaT, B->mCStep, quanA.template SPtr(), quanA.mCStep, - quanA.template ZPtr(), B->template RPtr(), B->mRedT, B->template ZPtr(), - quanA.template RPtr(), B->mBlockSize}, - {C, ldc_}}; - jblas::parallel::GemmKBlockRun(kernel, args, th); -} - -bool -JblasSQ4GemmBatchDriver( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - int8_t* WorkSpace, - MLAS_THREADPOOL* ThreadPool -) -{ - GetCPUDevice(); - ORTThreading orth(ThreadPool); - bool processed = true; - for (size_t i = 0; i < BatchN; i++) { - auto ptr = 
jblas::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); - auto uptr = std::unique_ptr(ptr); - if (ptr) { - if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) { - auto kptr = reinterpret_cast(ptr); - auto coretype = ptr->mCoreId; - auto NTile = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT - ); - auto CType = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT - ); - if (CType == uint32_t(gemm::CompType::COMP_FP32)) { - if (NTile == tAVX512F::NTILE && _cd->AVX512F()) { - JblasSQ4GemmCompF32( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } else if (NTile == tAVX2::NTILE && _cd->AVX2()) { - JblasSQ4GemmCompF32( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_US_INT32)) { - if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_SS_INT32)) { - if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) { - JblasSQ4GemmCompInt8( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc, - WorkSpace, &orth - ); - } - } - } - } else { - processed = false; - break; - } - } - return processed; -} - -template -static size_t -JblasSQ4GemmCompF32WorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc -) -{ - auto M_ = static_cast(M); - auto K_ = static_cast(K); - (void)(N); - (void)(lda); - (void)(ldc); - if (M <= 16) { - using Launcher = tLauncher_Fp32_S4_F32F32; - static Launcher kernel; - if (B->mIsAsym) { - auto reduceA = kernel.mProA.createStorage(M_, K_, B->mBlockSize); - return reduceA.mSize; - } - return 0; - } else { - using Launcher = jblas::wrapper::gemm::LauncherBase< - GemmCore_T::ISA, GemmCore_T, jblas::prologue_a::gemm::ActivationBase, - jblas::prologue_b::gemm::WeightKBlockS4, jblas::epilogue::gemm::AccumulatorWriteBackFp32>; - static Launcher kernel; - return 0; - } - return 0; -} - -template -static size_t -JblasSQ4GemmCompInt8WorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const float* A, - const size_t lda, - jblas::storage::gemm::StorageWeightKBlockS4* B, - float* C, - const size_t ldc -) -{ - using Parallel = jblas::parallel::gemm::SchedulerKBlock; - using Launcher = tLauncher_Int8_S4_F32F32; - static Launcher kernel; - (void)(N); - (void)(lda); - (void)(ldc); - auto quanA = kernel.mProA.createStorage( - static_cast(M), static_cast(K), static_cast(B->mBlockSize), B->mIsAsym - ); - return quanA.mSize; -} - -size_t -JblasSQ4GemmBatchWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const 
MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -) -{ - GetCPUDevice(); - size_t size = 0; - for (size_t i = 0; i < BatchN; i++) { - auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(DataParams[i].B); - auto uptr = std::unique_ptr(ptr); - if (ptr) { - if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) { - auto kptr = reinterpret_cast(ptr); - auto coretype = ptr->mCoreId; - auto NTile = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT - ); - auto CType = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT - ); - if (CType == uint32_t(gemm::CompType::COMP_FP32)) { - if (NTile == tAVX512F::NTILE && _cd->AVX512F()) { - size = std::max( - JblasSQ4GemmCompF32WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } else if (NTile == tAVX2::NTILE && _cd->AVX2()) { - size = std::max( - JblasSQ4GemmCompF32WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_US_INT32)) { - if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } else if (NTile == tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } - } - if (CType == uint32_t(gemm::CompType::COMP_INT8_SS_INT32)) { - if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) { - size = std::max( - JblasSQ4GemmCompInt8WorkspaceSize( - M, N, K, DataParams[i].A, DataParams[i].lda, kptr, DataParams[i].C, DataParams[i].ldc - ), - size - ); - } - } - } - } - } - return size; -} - -template -static size_t -JblasQ4BuSize(size_t block_size, size_t N, size_t K, bool isAsym) -{ - static T launcher; - auto stor = launcher.mProB.createStorage( - static_cast(N), static_cast(K), static_cast(block_size), JBLAS_DTYPE::S4_CLIP, JBLAS_DTYPE::F32, - JBLAS_DTYPE::BF16, isAsym - ); - // TODO(Yu) support more scale dtype - return stor.mSize; -} - -size_t -JblasQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType) -{ - GetCPUDevice(); - if (K % BlkSize != 0) { - return 0; - } - // from low precision to high precision - switch (CompType) { - case CompInt8: - if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - case CompBf16: - case CompFp16: - case CompFp32: - case CompUndef: - if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { - return JblasQ4BuSize>(BlkSize, N, K, isAsym); - } - break; - default: - return 0; - } - return 0; -} - -template -static void 
-JblasQ4GemmPackBImpl( - void* PackedBuf, - size_t BlkSize, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - bool IsAsym, - bool lastCall, - size_t ldb, - MLAS_THREADPOOL* ThreadPool -) -{ - static T JblasKernel; - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto stor = JblasKernel.mProB.createStorage( - N_, K_, static_cast(BlkSize), JBLAS_DTYPE::S4_CLIP, JBLAS_DTYPE::F32, JBLAS_DTYPE::BF16, IsAsym - ); - stor.assign(reinterpret_cast(PackedBuf)); - ORTThreading orth(ThreadPool); - JblasKernel.mProB.packNbitsWeight(N_, K_, IsAsym, QData, static_cast(ldb), Scale, Zp, &stor, &orth); - if (lastCall) { - JblasKernel.mProB.reduceWeight(&stor, &orth); - } -} - -bool -JblasQ4GemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t BlkSize, - bool isAsym, - bool lastCall, - MLAS_SQNBIT_COMPUTE_TYPE CompType, - MLAS_THREADPOOL* ThreadPool -) -{ - GetCPUDevice(); - // explicit statement fall through. - switch (CompType) { - case CompInt8: - if (_cd->AMX_INT8() && BlkSize % tAMX_INT8_SS::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - if (_cd->AVX512_VNNI() && BlkSize % tAVX512_VNNI::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - if (_cd->AVX_VNNI() && BlkSize % tAVX_VNNI::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - case CompBf16: - case CompFp16: - case CompFp32: - case CompUndef: - if (_cd->AVX512F() && BlkSize % tAVX512F::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - if (_cd->AVX2() && BlkSize % tAVX2::KTILE == 0) { - JblasQ4GemmPackBImpl>( - PackedBuf, BlkSize, QData, Scale, Zp, N, K, isAsym, lastCall, ldb, ThreadPool - ); - return true; - } - default: - return false; - } - return false; -} - -bool -JblasQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool) -{ - auto ptr = jblas::storage::gemm::PackedWeightParser::deserialBuffer(PackedBuf); - auto uptr = std::unique_ptr(ptr); - ORTThreading orth(ThreadPool); - auto N_ = static_cast(N); - auto K_ = static_cast(K); - auto ldb_ = static_cast(ldb); - GetCPUDevice(); - if (ptr) { - if (ptr->mPrologueID == JBLAS_PROLOGUEB_IDS::WeightKBlockS4) { - auto NTile = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::NTILE_MASK, jblas::gemm::CoreAttr::NTILE_SHIFT - ); - auto CType = jblas::gemm::CoreAttr::get_mask_val( - ptr->mCoreId, jblas::gemm::CoreAttr::COMP_MASK, jblas::gemm::CoreAttr::COMP_SHIFT - ); - if (CType == uint32_t(jblas::gemm::CompType::COMP_FP32)) { - if (NTile == tAVX512F::NTILE && _cd->AVX512F()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } else if (NTile == tAVX2::NTILE && _cd->AVX2()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } - } - if (CType == uint32_t(jblas::gemm::CompType::COMP_INT8_US_INT32)) { - if (NTile == tAMX_INT8_US::NTILE && _cd->AMX_INT8()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } else if (NTile == 
tAVX512_VNNI::NTILE && _cd->AVX512_VNNI()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } else if (NTile == tAVX_VNNI::NTILE && _cd->AVX_VNNI()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } - } - if (CType == uint32_t(jblas::gemm::CompType::COMP_INT8_SS_INT32)) { - if (NTile == tAMX_INT8_SS::NTILE && _cd->AMX_INT8()) { - static jblas::prologue_b::gemm::WeightKBlockS4 proB; - proB.unpackWeight(N_, K_, ptr, FpData, ldb_, &orth); - } - } - } - return true; - } - return false; -} diff --git a/onnxruntime/core/mlas/lib/jblas_gemm.h b/onnxruntime/core/mlas/lib/jblas_gemm.h deleted file mode 100644 index 044dc5e849a0a..0000000000000 --- a/onnxruntime/core/mlas/lib/jblas_gemm.h +++ /dev/null @@ -1,61 +0,0 @@ -/*++ - -Copyright (c) Microsoft Corporation. All rights reserved. - -Licensed under the MIT License. - -Module Name: - - jblas_gemm.h - -Abstract: - - Currently only support Q4 gemm. ---*/ - -#pragma once - -#include "mlas_qnbit.h" - -size_t -JblasQ4GemmPackBSize(size_t N, size_t K, size_t BlkSize, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType); - -bool -JblasQ4GemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t BlkSize, - bool isAsym, - bool lastCall, - MLAS_SQNBIT_COMPUTE_TYPE CompType, - MLAS_THREADPOOL* ThreadPool -); - -bool -JblasQ4GemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb - , MLAS_THREADPOOL* ThreadPool); - -bool -JblasSQ4GemmBatchDriver( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - int8_t* WorkSpace, - MLAS_THREADPOOL* ThreadPool -); - -size_t -JblasSQ4GemmBatchWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -); diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp index 7d877848017fe..0d8a5692359a6 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm.cpp @@ -19,10 +19,6 @@ Module Name: #include -#ifdef MLAS_JBLAS -#include "jblas_gemm.h" -#endif - namespace { @@ -694,127 +690,3 @@ MlasSQNBitGemmBatch( ComputeOperation(BlkLen, K, Data, PerGemmWorkspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN); }); } - -size_t MLASCALL -MlasNBitsGemmPackBSize( - size_t N, size_t K, size_t BlkSize, int nbits, bool isAsym, MLAS_SQNBIT_COMPUTE_TYPE CompType -) -{ -#ifdef MLAS_JBLAS - if (nbits == 4) { - auto jsize = JblasQ4GemmPackBSize(N, K, BlkSize, isAsym, CompType); - if (jsize) { - return jsize; - } - } -#endif - (void)(N); - (void)(K); - (void)(BlkSize); - (void)(nbits); - (void)(isAsym); - (void)(CompType); - return 0; -} - -void MLASCALL -MlasNBitsGemmPackB( - void* PackedBuf, - const uint8_t* QData, - const float* Scale, - const uint8_t* Zp, - size_t N, - size_t K, - size_t ldb, - size_t BlkSize, - int nbits, - bool isAsym, - bool lastCall, - MLAS_SQNBIT_COMPUTE_TYPE CompType, - MLAS_THREADPOOL* ThreadPool -) -{ -#ifdef MLAS_JBLAS - if (nbits == 4) { - if (JblasQ4GemmPackB(PackedBuf, QData, Scale, Zp, N, K, ldb, BlkSize, isAsym, lastCall, CompType, ThreadPool)) { - return; - } - } -#endif - (void)(PackedBuf); - (void)(QData); - (void)(Scale); - (void)(Zp); - (void)(N); - (void)(K); - (void)(ldb); - (void)(BlkSize); - (void)(nbits); - 
(void)(isAsym); - (void)(lastCall); - (void)(CompType); - (void)(ThreadPool); -} - -void MLASCALL -MlasNBitsGemmUnPackB(float* FpData, const void* PackedBuf, size_t N, size_t K, size_t ldb, MLAS_THREADPOOL* ThreadPool) -{ -#ifdef MLAS_JBLAS - if (JblasQ4GemmUnPackB(FpData, PackedBuf, N, K, ldb, ThreadPool)) { - return; - } -#endif - (void)(FpData); - (void)(PackedBuf); - (void)(N); - (void)(K); - (void)(ldb); - (void)(ThreadPool); -} - -size_t MLASCALL -MlasSQNBitsGemmBatchPackedBWorkspaceSize( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams -) -{ -#ifdef MLAS_JBLAS - return JblasSQ4GemmBatchWorkspaceSize(M, N, K, BatchN, DataParams); -#endif - (void)(M); - (void)(N); - (void)(K); - (void)(BatchN); - (void)(DataParams); - return 0; -} - -void MLASCALL -MlasSQNBitsGemmBatchPackedB( - const size_t M, - const size_t N, - const size_t K, - const size_t BatchN, - const MLAS_SQNBITS_GEMM_DATA_PACKED_PARAMS* DataParams, - void* WorkSpace, - MLAS_THREADPOOL* ThreadPool -) -{ - GetMlasPlatform(); -#ifdef MLAS_JBLAS - if (JblasSQ4GemmBatchDriver(M, N, K, BatchN, DataParams, reinterpret_cast(WorkSpace), ThreadPool)) { - // PackedWeight is created by jblas - return; - } -#endif - (void)(M); - (void)(N); - (void)(K); - (void)(BatchN); - (void)(DataParams); - (void)(WorkSpace); - (void)(ThreadPool); -} diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format b/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format deleted file mode 100644 index 84b876706161d..0000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/.clang-format +++ /dev/null @@ -1,7 +0,0 @@ -Language: Cpp -BasedOnStyle: Google -DerivePointerAlignment: false -ColumnLimit: 120 -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SortIncludes: false diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt b/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt deleted file mode 100644 index 5d9c5edf45a96..0000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -cmake_minimum_required(VERSION 3.5) - -project(jblas LANGUAGES CXX VERSION 0.1.0) - -file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp) -file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp) - -add_library(${PROJECT_NAME} INTERFACE) -add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) - -target_include_directories( - ${PROJECT_NAME} INTERFACE - "$" - "$" -) - -if(WIN32) - target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX) - target_compile_options(${PROJECT_NAME} INTERFACE /wd4068 /wd4849 /wd6262 /wd4702 /wd4100) - #4068 ignore unroll and GCC flags - #4849 ignore collapse - #6262 ignore stack too large - #4702 unreachable code(false warning on constexpr condition) - #4100 unreferenced formal parameter - - target_link_options(${PROJECT_NAME} INTERFACE /STACK:3145728) #Stack requires up to L2 cache size -endif(WIN32) - - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17) diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h deleted file mode 100644 index 143adb771760b..0000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_base.h +++ /dev/null @@ -1,303 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 
(the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include -#include -#include "xbyak/xbyak.h" -#include "xbyak/xbyak_util.h" - -#define OFFSET(field) offsetof(params, field) - -namespace jblas { - -namespace xbyak { -class JitBase : protected Xbyak::CodeGenerator { - protected: - JitBase(size_t size = 16 * 1024) : CodeGenerator(size) {} - - void load32(const Xbyak::Reg64& reg, const Xbyak::Address& addr) { - xor_(reg, reg); - mov(reg.cvt32(), addr); - } - - void vreg_push(const Xbyak::Reg64& baseaddr) { -#ifdef _WIN32 - for (int i = 0; i < 10; i++) { - movaps(xword[baseaddr + i * 16], Xbyak::Xmm(6 + i)); - } -#endif - } - - void vreg_pop(const Xbyak::Reg64& baseaddr) { -#ifdef _WIN32 - for (int i = 0; i < 10; i++) { - movaps(Xbyak::Xmm(6 + i), xword[baseaddr + i * 16]); - } -#endif - } - - void padto_le(const Xbyak::Reg64& _src, int padding) { - // _src=_src/padding*padding - if (padding == 1) { - return; - } - for (int i = 1; i < 16; i++) { - if ((1 << i) == padding) { - shr(_src, i); - shl(_src, i); - return; - } - } - assert(0); - } - - void generate_Nbitsmask(const Xbyak::Opmask& _msk, const Xbyak::Reg64& _pos, const Xbyak::Address& _total, - const Xbyak::Reg64& _tmp, const Xbyak::Reg64& _tmp1, int N) { - inLocalLabel(); - lea(_tmp, _total); - sub(_tmp, _pos); - cmp(_tmp, N); - jb(".maskflag"); - cmp(_tmp, 0); - jl(".zeroflag"); - uint64_t allmask = (static_cast(1) << N) - 1; - if (N == 64) { - allmask = static_cast(-1); - } - mov(_tmp, allmask); - kmovq(_msk, _tmp); - jmp(".maskend"); - L(".maskflag"); - mov(_tmp1, 1); - shlx(_tmp1, _tmp1, _tmp); - sub(_tmp1, 1); - kmovq(_msk, _tmp1); - jmp(".maskend"); - L(".zeroflag"); - mov(_tmp1, 0); - kmovq(_msk, _tmp1); - L(".maskend"); - outLocalLabel(); - } - void generate_Nbitsmask(const Xbyak::Opmask& _msk, const Xbyak::Reg64& _pos, const Xbyak::Reg64& _total, - const Xbyak::Reg64& _tmp, const Xbyak::Reg64& _tmp1, int N) { - generate_Nbitsmask(_msk, _pos, ptr[_total], _tmp, _tmp1, N); - } -}; - -class JitAvx : protected JitBase { - protected: - static int constexpr VBits = 256; - static int constexpr VecBytes = VBits / 8; - static int constexpr RegCount = 16; - typedef Xbyak::Ymm vreg_t; -}; - -class JitAvx2 : protected JitAvx { - protected: - static int constexpr VBits = 256; - typedef Xbyak::Ymm vreg_t; - void vxor(const vreg_t& x1, const vreg_t& x2, const Xbyak::Operand& op) { vpxor(x1, x2, op); } - - void loadbf16_f32(const Xbyak::Ymm& dst, const Xbyak::Address& addr) { - vpmovzxwd(dst, addr); - vpslld(dst, dst, 16); - } -}; - -class JitAvx512f : protected JitAvx2 { - protected: - static int constexpr VBits = 512; - static int constexpr VecBytes = VBits / 8; - static int constexpr RegCount = 32; - typedef Xbyak::Zmm vreg_t; - - void vxor(const vreg_t& x1, const vreg_t& x2, const Xbyak::Operand& op) { vpxorq(x1, x2, op); } - - void interleave_2rows_4regs(Xbyak::Zmm* src_2regs, Xbyak::Zmm* tmp_2reg) { - vpunpcklwd(tmp_2reg[0], src_2regs[0], src_2regs[1]); - vpunpckhwd(tmp_2reg[1], src_2regs[0], src_2regs[1]); - vshuff32x4(src_2regs[0], tmp_2reg[0], 
tmp_2reg[1], 0 | (1 << 2) | (0 << 4) | (1 << 6)); - vshuff32x4(src_2regs[0], src_2regs[0], src_2regs[0], 0 | (2 << 2) | (1 << 4) | (3 << 6)); - vshuff32x4(src_2regs[1], tmp_2reg[0], tmp_2reg[1], 2 | (3 << 2) | (2 << 4) | (3 << 6)); - vshuff32x4(src_2regs[1], src_2regs[1], src_2regs[1], 0 | (2 << 2) | (1 << 4) | (3 << 6)); - } - - void transpose16x16_4B(Xbyak::Zmm* src, Xbyak::Zmm* tmp, const int N = 16) { - for (int i = 0; i < 8; ++i) { - vpunpckldq(tmp[2 * i + 0], src[2 * i], src[2 * i + 1]); - vpunpckhdq(tmp[2 * i + 1], src[2 * i], src[2 * i + 1]); - } - - for (int i = 0; i < 4; ++i) { - vpunpcklqdq(src[4 * i + 0], tmp[4 * i + 0], tmp[4 * i + 2]); - vpunpckhqdq(src[4 * i + 1], tmp[4 * i + 0], tmp[4 * i + 2]); - vpunpcklqdq(src[4 * i + 2], tmp[4 * i + 1], tmp[4 * i + 3]); - vpunpckhqdq(src[4 * i + 3], tmp[4 * i + 1], tmp[4 * i + 3]); - } - - for (int i = 0; i < 2; ++i) { - vshufi32x4(tmp[8 * i + 0], src[8 * i + 0], src[8 * i + 4], 0x88); - vshufi32x4(tmp[8 * i + 1], src[8 * i + 1], src[8 * i + 5], 0x88); - vshufi32x4(tmp[8 * i + 2], src[8 * i + 2], src[8 * i + 6], 0x88); - vshufi32x4(tmp[8 * i + 3], src[8 * i + 3], src[8 * i + 7], 0x88); - vshufi32x4(tmp[8 * i + 4], src[8 * i + 0], src[8 * i + 4], 0xdd); - vshufi32x4(tmp[8 * i + 5], src[8 * i + 1], src[8 * i + 5], 0xdd); - vshufi32x4(tmp[8 * i + 6], src[8 * i + 2], src[8 * i + 6], 0xdd); - vshufi32x4(tmp[8 * i + 7], src[8 * i + 3], src[8 * i + 7], 0xdd); - } - - // last step and move out - for (int i = 0; i < N; ++i) { - vshufi32x4(src[i], tmp[i % 8], tmp[8 + i % 8], i < 8 ? 0x88 : 0xdd); - } - } - - void interleave_4rows_6regs(Xbyak::Zmm* src_4regs, Xbyak::Zmm* tmp_regs, const Xbyak::Opmask* masks) { - vpunpcklbw(tmp_regs[0], src_4regs[0], src_4regs[1]); - vpunpckhbw(tmp_regs[1], src_4regs[0], src_4regs[1]); - vpunpcklbw(tmp_regs[2], src_4regs[2], src_4regs[3]); - vpunpckhbw(tmp_regs[3], src_4regs[2], src_4regs[3]); - - vpunpcklwd(tmp_regs[4], tmp_regs[0], tmp_regs[2]); - vpunpckhwd(tmp_regs[5], tmp_regs[0], tmp_regs[2]); - vpunpcklwd(tmp_regs[0], tmp_regs[1], tmp_regs[3]); - vpunpckhwd(tmp_regs[2], tmp_regs[1], tmp_regs[3]); - vshuff32x4(tmp_regs[1], tmp_regs[4], tmp_regs[0], (4 << 4) | 4); - vshuff32x4(tmp_regs[3], tmp_regs[5], tmp_regs[2], (4 << 4) | 4); - vmovups(src_4regs[0], tmp_regs[1]); - vshuff32x4(src_4regs[0] | masks[0], tmp_regs[3], tmp_regs[3], 0 | (0 << 2) | (0 << 4) | (2 << 6)); - vmovups(src_4regs[1], tmp_regs[3]); - vshuff32x4(src_4regs[1] | masks[1], tmp_regs[1], tmp_regs[1], 1 | (0 << 2) | (3 << 4) | (0 << 6)); - vshuff32x4(tmp_regs[1], tmp_regs[4], tmp_regs[0], (14 << 4) | 14); - vshuff32x4(tmp_regs[3], tmp_regs[5], tmp_regs[2], (14 << 4) | 14); - vmovups(src_4regs[2], tmp_regs[1]); - vshuff32x4(src_4regs[2] | masks[0], tmp_regs[3], tmp_regs[3], 0 | (0 << 2) | (0 << 4) | (2 << 6)); - vmovups(src_4regs[3], tmp_regs[3]); - vshuff32x4(src_4regs[3] | masks[1], tmp_regs[1], tmp_regs[1], 1 | (0 << 2) | (3 << 4) | (0 << 6)); - } - - void cvt_fp32_bf16(const Xbyak::Ymm& _bf16, const Xbyak::Zmm& _fp32) { - vpsrld(_fp32, _fp32, 16); - vpmovdw(_bf16, _fp32); - } - - void loadbf16_f32(const Xbyak::Zmm& dst, const Xbyak::Address& addr) { - vpmovzxwd(dst, addr); - vpslld(dst, dst, 16); - } - - void broadcastbf16_f32(const Xbyak::Zmm& dst, const Xbyak::Reg64& tmp, const Xbyak::Address& addr) { - mov(tmp.cvt16(), addr); - shl(tmp.cvt32(), 16); - vpbroadcastd(dst, tmp.cvt32()); - } - - void store_fp32_bf16(const Xbyak::Zmm& _fp32, const Xbyak::Address& _add) { - auto bf16 = Xbyak::Ymm(_fp32.getIdx()); - cvt_fp32_bf16(bf16, _fp32); - 
vmovups(_add, bf16); - } -}; - -class JitAvx512_bf16 : protected JitAvx512f {}; - -class JitAvx512_fp16 : protected JitAvx512f {}; - -class JitAvx512vnni : protected JitAvx512f { - protected: - void vpdpbusds_(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) { - vpdpbusds(x1, x2, op, Xbyak::EvexEncoding); - } -}; - -class JitAvxvnni : protected JitAvx2 { - protected: - void vpdpbusds_(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, const Xbyak::Operand& op) { - vpdpbusds(x1, x2, op, Xbyak::VexEncoding); - } -}; - -class JitAmxtile : protected JitAvx512f { - public: - struct alignas(64) tileconfig_t { - uint8_t palette_id; - uint8_t reserved[15]; - uint16_t colb[16]; - uint8_t rows[16]; - }; - static int constexpr TileCount = 8; - - typedef long long (*configure_t)(void*); - - static void generate_config(Xbyak::CodeGenerator* g) { - Xbyak::util::StackFrame st(g, 1, 0, 0); - auto& parambase = st.p[0]; - g->ldtilecfg(g->ptr[parambase]); - } - - static void configure_tiles(tileconfig_t& tc, int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum, - int CNum) { - // Filling tile configure structure. Could be done offline. - tc.palette_id = 1; - // Configure C tiles - int t = 0; - for (; t < CNum; ++t) { - tc.rows[t] = static_cast(TILE_M); - tc.colb[t] = static_cast(TILE_N * 4); - } - // Configure A tiles - for (; t < CNum + ANum; ++t) { - tc.rows[t] = static_cast(TILE_M); - tc.colb[t] = static_cast(TILE_K * elesize); - } - // Configure B tile. B effectively has 64 rows and 16 columns. - int kpack = 4 / elesize; - for (; t < CNum + ANum + BNum; ++t) { - tc.rows[t] = static_cast(TILE_K / kpack); - tc.colb[t] = static_cast(TILE_N * 4); - } - } -}; - -class JitAmxbf16 : protected JitAmxtile { - protected: - void cvt_fp32_bf16(const Xbyak::Ymm& _bf16, const Xbyak::Zmm& _fp32) { vcvtneps2bf16(_bf16, _fp32); } -}; - -class JitAmxint8 : protected JitAmxtile { - protected: - template - void _tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3); -}; -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbssd(x1, x2, x3); -} -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbsud(x1, x2, x3); -} -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbusd(x1, x2, x3); -} -template <> -inline void JitAmxint8::_tdpb(const Xbyak::Tmm& x1, const Xbyak::Tmm& x2, const Xbyak::Tmm& x3) { - tdpbuud(x1, x2, x3); -} -} // namespace xbyak -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h deleted file mode 100644 index 8ecf3535c17f4..0000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas.h +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#pragma once
-#include 
-enum JBLAS_CODE {
- JblasSuccess = 0,
- JblasInvalidParam = 1,
- JblasInvalidISA = 2,
- JblasRuntimeError = 4,
- JblasNotSupport = 8,
-};
-enum JBLAS_ISA : uint32_t {
- JblasNoSIMD = 0,
- JblasAVX,
- JblasAVX2,
- JblasAVX_VNNI,
- JblasAVX512F,
- JblasAVX512_VNNI,
- JblasAMX_BF16,
- JblasAMX_INT8,
- JblasAVX512_FP16,
- JblasAVX512_BF16,
-};
-enum class JBLAS_DTYPE : uint32_t {
- EleBitsMask = 0xff,
- EleBitsUndef = 0,
- EleBits4 = 4,
- EleBits8 = 8,
- EleBits16 = 16,
- EleBits32 = 32,
- EleBits64 = 64,
- TypeMask = 0xff00,
- TypeFloat = 0 << 8,
- TypeInt = 1 << 8,
- SubTypeMask = 0xff0000,
- SubType0 = 0 << 16,
- SubType1 = 1 << 16,
- SubType2 = 2 << 16,
- F64 = EleBits64 | TypeFloat,
- F32 = EleBits32 | TypeFloat,
- F16 = EleBits16 | TypeFloat,
- BF16 = EleBits16 | TypeFloat | SubType1,
- F8_E4M3 = EleBits8 | TypeFloat,
- F8_E5M2 = EleBits8 | TypeFloat | SubType1,
- F8_E3M4 = EleBits8 | TypeFloat | SubType2,
- S8 = EleBits8 | TypeInt,
- U8 = EleBits8 | TypeInt | SubType1,
- S4_CLIP = EleBits4 | TypeInt,
- S4_FULLRANGE = EleBits4 | TypeInt | SubType1,
- F4_E2M1 = EleBits4 | TypeFloat,
- F4_BNB = EleBits4 | TypeFloat | SubType1,
- F4_NF4 = EleBits4 | TypeFloat | SubType2,
- S32 = EleBits32 | TypeInt,
- U32 = EleBits32 | TypeInt | SubType1,
-};
-
-enum JBLAS_LAYOUT { JblasRowMajor = 101, JblasColMajor = 102 };
-enum JBLAS_TRANSPOSE {
- JblasNoTrans = 111,
- JblasTrans = 112,
- JblasConjTrans = 113,
-};
-enum JBLAS_ELTWISEOP {
- GELU,
- SWISH,
- TANH,
- EXP,
- LOW_PRECISION_EXP,
- RELU,
- LINEAR,
-};
-
-enum class JBLAS_PROLOGUEB_IDS : uint32_t {
- Undef = (uint32_t)-1,
- Begin = 0,
- NormalBegin = Begin,
- WeightPack = NormalBegin,
- NormalEnd,
- KBlockBegin = NormalEnd,
- WeightKBlockS8 = KBlockBegin,
- WeightKBlockS4,
- WeightKBlockF4,
- KBlockEnd,
- End,
-};
diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h
deleted file mode 100644
index 5cac1080bc610..0000000000000
--- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_device.h
+++ /dev/null
@@ -1,277 +0,0 @@
-// Copyright (c) 2023 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once -#include "jit_blas.h" -#include "xbyak/xbyak_util.h" - -namespace jblas { - -namespace device { - -struct X64_ISA { - int64_t MMX : 1; // 0 - int64_t SSE : 1; // 1 - int64_t SSE2 : 1; // 2 - int64_t SSE3 : 1; // 3 - int64_t SSSE3 : 1; // 4 - int64_t SSE41 : 1; // 5 - int64_t SSE42 : 1; // 6 - int64_t AVX : 1; // 7 - int64_t F16C : 1; // 8 - int64_t FMA : 1; // 9 - int64_t AVX2 : 1; // 10 - int64_t AVX_VNNI : 1; // 11 - int64_t AVX_VNNI_INT8 : 1; // 12 - int64_t AVX_NE_CONVERT : 1; // 13 - int64_t AVX_IFMA : 1; // 14 - int64_t AVX512F : 1; // 15 - int64_t AVX512BW : 1; // 16 - int64_t AVX512CD : 1; // 17 - int64_t AVX512DQ : 1; // 18 - int64_t AVX512ER : 1; // 19 - int64_t AVX512IFMA52 : 1; // 20 - int64_t AVX512PF : 1; // 21 - int64_t AVX512VL : 1; // 22 - int64_t AVX512VPOPCNTDQ : 1; // 23 - int64_t AVX512_4FMAPS : 1; // 24 - int64_t AVX512_4VNNIW : 1; // 25 - int64_t AVX512_BF16 : 1; // 26 - int64_t AVX512_BITALG : 1; // 27 - int64_t AVX512_VBMI : 1; // 28 - int64_t AVX512_VBMI2 : 1; // 29 - int64_t AVX512_VNNI : 1; // 30 - int64_t AVX512_VP2INTERSECT : 1; // 31 - int64_t AVX512_FP16 : 1; // 32 - int64_t AMX_TILE : 1; // 33 - int64_t AMX_BF16 : 1; // 34 - int64_t AMX_INT8 : 1; // 35 - int64_t AMX_FP16 : 1; // 36 - int64_t AMX_COMPLEX : 1; // 37 - int64_t reserved : (64 - 38); -}; - -class AVX2_Default { - public: - static constexpr bool MMX = 1; - static constexpr bool SSE = 1; - static constexpr bool SSE2 = 1; - static constexpr bool SSE3 = 1; - static constexpr bool SSSE3 = 1; - static constexpr bool SSE41 = 1; - static constexpr bool SSE42 = 1; - static constexpr bool AVX = 1; - static constexpr bool F16C = 1; - static constexpr bool FMA = 1; - static constexpr bool AVX2 = 1; - static constexpr bool AVX_VNNI = 0; - static constexpr bool AVX_VNNI_INT8 = 0; - static constexpr bool AVX_NE_CONVERT = 0; - static constexpr bool AVX_IFMA = 0; - static constexpr bool AVX512F = 0; - static constexpr bool AVX512BW = 0; - static constexpr bool AVX512CD = 0; - static constexpr bool AVX512DQ = 0; - static constexpr bool AVX512ER = 0; - static constexpr bool AVX512IFMA52 = 0; - static constexpr bool AVX512PF = 0; - static constexpr bool AVX512VL = 0; - static constexpr bool AVX512VPOPCNTDQ = 0; - static constexpr bool AVX512_4FMAPS = 0; - static constexpr bool AVX512_4VNNIW = 0; - static constexpr bool AVX512_BF16 = 0; - static constexpr bool AVX512_BITALG = 0; - static constexpr bool AVX512_VBMI = 0; - static constexpr bool AVX512_VBMI2 = 0; - static constexpr bool AVX512_VNNI = 0; - static constexpr bool AVX512_VP2INTERSECT = 0; - static constexpr bool AVX512_FP16 = 0; - static constexpr bool AMX_TILE = 0; - static constexpr bool AMX_BF16 = 0; - static constexpr bool AMX_INT8 = 0; - static constexpr bool AMX_FP16 = 0; - static constexpr bool AMX_COMPLEX = 0; -}; - -class AVX512_VNNI_Default { - public: - static constexpr bool MMX = 1; - static constexpr bool SSE = 1; - static constexpr bool SSE2 = 1; - static constexpr bool SSE3 = 1; - static constexpr bool SSSE3 = 1; - static constexpr bool SSE41 = 1; - static constexpr bool SSE42 = 1; - static constexpr bool AVX = 1; - static constexpr bool F16C = 1; - static constexpr bool FMA = 1; - static constexpr bool AVX2 = 1; - static constexpr bool AVX_VNNI = 0; - static constexpr bool AVX_VNNI_INT8 = 0; - static constexpr bool AVX_NE_CONVERT = 0; - static constexpr bool AVX_IFMA = 0; - static constexpr bool AVX512F = 1; - static constexpr bool AVX512BW = 1; - static constexpr bool AVX512CD = 1; - static constexpr bool AVX512DQ = 1; - 
static constexpr bool AVX512ER = 0; - static constexpr bool AVX512IFMA52 = 0; - static constexpr bool AVX512PF = 0; - static constexpr bool AVX512VL = 1; - static constexpr bool AVX512VPOPCNTDQ = 0; - static constexpr bool AVX512_4FMAPS = 0; - static constexpr bool AVX512_4VNNIW = 0; - static constexpr bool AVX512_BF16 = 0; - static constexpr bool AVX512_BITALG = 0; - static constexpr bool AVX512_VBMI = 0; - static constexpr bool AVX512_VBMI2 = 0; - static constexpr bool AVX512_VNNI = 1; - static constexpr bool AVX512_VP2INTERSECT = 0; - static constexpr bool AVX512_FP16 = 0; - static constexpr bool AMX_TILE = 0; - static constexpr bool AMX_BF16 = 0; - static constexpr bool AMX_INT8 = 0; - static constexpr bool AMX_FP16 = 0; - static constexpr bool AMX_COMPLEX = 0; -}; - -class SapphireRapids { - public: - static constexpr bool MMX = 1; - static constexpr bool SSE = 1; - static constexpr bool SSE2 = 1; - static constexpr bool SSE3 = 1; - static constexpr bool SSSE3 = 1; - static constexpr bool SSE41 = 1; - static constexpr bool SSE42 = 1; - static constexpr bool AVX = 1; - static constexpr bool F16C = 1; - static constexpr bool FMA = 1; - static constexpr bool AVX2 = 1; - static constexpr bool AVX_VNNI = 0; - static constexpr bool AVX_VNNI_INT8 = 0; - static constexpr bool AVX_NE_CONVERT = 0; - static constexpr bool AVX_IFMA = 0; - static constexpr bool AVX512F = 1; - static constexpr bool AVX512BW = 1; - static constexpr bool AVX512CD = 1; - static constexpr bool AVX512DQ = 1; - static constexpr bool AVX512ER = 0; - static constexpr bool AVX512IFMA52 = 0; - static constexpr bool AVX512PF = 0; - static constexpr bool AVX512VL = 1; - static constexpr bool AVX512VPOPCNTDQ = 0; - static constexpr bool AVX512_4FMAPS = 0; - static constexpr bool AVX512_4VNNIW = 0; - static constexpr bool AVX512_BF16 = 0; - static constexpr bool AVX512_BITALG = 0; - static constexpr bool AVX512_VBMI = 0; - static constexpr bool AVX512_VBMI2 = 0; - static constexpr bool AVX512_VNNI = 1; - static constexpr bool AVX512_VP2INTERSECT = 0; - static constexpr bool AVX512_FP16 = 0; - static constexpr bool AMX_TILE = 1; - static constexpr bool AMX_BF16 = 1; - static constexpr bool AMX_INT8 = 1; - static constexpr bool AMX_FP16 = 0; - static constexpr bool AMX_COMPLEX = 0; -}; - -template -class isa_base { - public: - static bool constexpr avx = ISA_T >= JblasAVX; - static bool constexpr avx2 = ISA_T >= JblasAVX2; - static bool constexpr avx512f = ISA_T >= JblasAVX512F; - static bool constexpr avx512_vnni = ISA_T >= JblasAVX512_VNNI; - static bool constexpr avx512_fp16 = ISA_T >= JblasAVX512_FP16; - static bool constexpr amx_bf16 = ISA_T >= JblasAMX_BF16; - static bool constexpr amx_int8 = ISA_T >= JblasAMX_INT8; -}; - -class CpuDevice { - public: - inline void setThreads(int _nth) { - if (_nth <= 0) { - numthreads = numcores; - } else { - numthreads = std::min(numcores, _nth); - } - } - inline int getThreads() { return numthreads; } - inline int getCores() { return numcores; } - inline uint32_t getL2CacheSize() { return L2Cache; } - inline uint32_t getL1CacheSize() { return L1Cache; } - inline bool AVX() { return mHasAVX; } - inline bool AVX2() { return mHasAVX2; } - inline bool AVX_VNNI() { return mHasAVX_VNNI; } - inline bool AVX512F() { return mHasAVX512F; } - inline bool AVX512_VNNI() { return mHasAVX512_VNNI; } - inline bool AMX_INT8() { return mHasAMX_INT8; } - inline bool AMX_BF16() { return mHasAMX_BF16; } - inline bool AVX512_BF16() { return mHasAVX512_BF16; } - inline bool AVX512_FP16() { return 
mHasAVX512_FP16; } -#define ADD_FLAG(isa) mHas##isa = _cpu.has(_cpu.t##isa) - CpuDevice() { - static Xbyak::util::Cpu _cpu; - L1Cache = _cpu.getDataCacheSize(0); - L2Cache = _cpu.getDataCacheSize(1); - ADD_FLAG(AVX); - ADD_FLAG(AVX2); - ADD_FLAG(AVX512F); - ADD_FLAG(AVX512_VNNI); - ADD_FLAG(AVX_VNNI); - ADD_FLAG(AMX_BF16); - ADD_FLAG(AMX_INT8); - ADD_FLAG(AVX512_BF16); - ADD_FLAG(AVX512_FP16); - numcores = _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::CoreLevel); - numthreads = numcores; - } - - static CpuDevice* getInstance() { - static CpuDevice instance; - return &instance; - } - - void print() { - printf( - "AVX:%d AVX2:%d AVX512F:%d AVX_VNNI:%d AVX512_VNNI:%d AMX_INT8:%d AMX_BF16:%d AVX512_BF16:%d AVX512_FP16:%d\n", - mHasAVX, mHasAVX2, mHasAVX512F, mHasAVX_VNNI, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512_BF16, - mHasAVX512_FP16); - } -#undef ADD_FLAG - - protected: - uint32_t L2Cache, L1Cache; - bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16, - mHasAVX512_FP16; - int numcores; - int numthreads; -}; - -#define GetCPUDevice() auto _cd = jblas::device::CpuDevice::getInstance(); - -class CpuBase { - public: - CpuBase() { - GetCPUDevice(); - mL2Cache = _cd->getL2CacheSize(); - mL1Cache = _cd->getL1CacheSize(); - mNumThreads = _cd->getThreads(); - } - size_t mL2Cache, mL1Cache; - int mNumThreads; -}; -} // namespace device -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h deleted file mode 100644 index ceb7a545092d8..0000000000000 --- a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_epilogue.h +++ /dev/null @@ -1,329 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include "jit_base.h" -#include "jit_blas.h" -#include "jit_blas_utils.h" -#include "kernel_wrapper.h" - -namespace jblas { -namespace epilogue { -namespace gemm { - -template -class AccumulatorWriteBack { - public: - using SType = _SRC_T; - using DType = _DST_T; - struct Param { - DType* C; - int ldc; - void* elt_const_v; - }; - - template - JBLAS_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize, Eltops... 
ops) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - bool constexpr Valid = !std::is_same::value || std::is_same::value; - static_assert(Valid, "fp32 to bf16 conversion only."); - if constexpr (std::is_same::value) { - return kernel::wrapper::Memcpy2DFp32CvtBf16::template forward( - const_cast<_SRC_T*>(cacheptr), cptr, M, N, cachestep * sizeof(SType), _param.ldc * sizeof(DType), false); - } else if constexpr (std::is_same, std::tuple>::value) { - return kernel::wrapper::Memcpy2DFp16CvtFp32::template forward( - const_cast<_SRC_T*>(cacheptr), cptr, M, N, cachestep * sizeof(SType), _param.ldc * sizeof(DType), false); - } else if constexpr (sizeof(SType) == sizeof(DType)) { - return kernel::wrapper::Memcpy2D::template forward(cacheptr, cptr, M, N, cachestep, - _param.ldc, _param.elt_const_v, ops...); - } else { - assert(false); - } - } -}; - -template -class CustomAccumulatorWriteBackWithEltop { - public: - struct Param { - _DST_T* C; - int ldc; - void* elt_const_v; - }; - JBLAS_CODE forward(const _SRC_T* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - if constexpr (std::is_same<_SRC_T, float>::value && std::is_same<_DST_T, float>::value) { - return kernel::wrapper::Memcpy2D::template forward1(cacheptr, cptr, M, N, cachestep, - _param.ldc, _param.elt_const_v); - } else { - assert(false); - } - } -}; -template -using AccumulatorWriteBackFp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackInt32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackBf16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp16 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp16Fp32 = AccumulatorWriteBack; -template -using AccumulatorWriteBackFp32Bf16 = AccumulatorWriteBack; - -template -using AccumulatorWriteBackWithGeluFp32 = CustomAccumulatorWriteBackWithEltop; - -template -using AccumulatorWriteBackWithSwishFp32 = CustomAccumulatorWriteBackWithEltop; - -template -class AlphaBetaProcessFp32 { - public: - struct Param { - float *C, *D; - int ldc, ldd; - float alpha, beta; - }; - - JBLAS_CODE forward(const float* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto DOffset = M_offset * _param.ldd + N_offset; - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - auto dptr = _param.D + DOffset; - return kernel::wrapper::AlphaBetaF32F32::template forward(_param.alpha, cacheptr, cachestep, _param.beta, - dptr, _param.ldd, cptr, _param.ldc, M, N); - } -}; - -template -class CompFp32BlockEpilogue { - public: - struct Param { - void* scales; - JBLAS_DTYPE scaledtype; - int ldsb; - int8_t* zps = nullptr; - float* reduce = nullptr; - int ldra; - }; - JBLAS_CODE forward(const float* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, - const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, - size_t cachesize) { - auto ret = JblasNotSupport; - if (_param.scaledtype == JBLAS_DTYPE::F32) { - ret = kernel::wrapper::CompFp32BlockScale::template forward( - reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, - cachestep, M, N); - assert(ret == JblasSuccess); - if (_param.zps != nullptr) { - ret = 
kernel::wrapper::RemoveZeroPointBias::forward_wei( - dstptr, cachestep, M, N, _param.zps + K_offset * _param.ldsb + N_offset, - reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, _param.ldra, - _param.reduce + M_offset * _param.ldra + K_offset); - } - assert(ret == JblasSuccess); - return ret; - } else if (_param.scaledtype == JBLAS_DTYPE::BF16) { - ret = kernel::wrapper::CompFp32BlockScale::template forward( - reinterpret_cast(_param.scales) + K_offset * _param.ldsb + N_offset, srcptr, cachestep, dstptr, - cachestep, M, N); - assert(_param.zps == nullptr); - assert(ret == JblasSuccess); - return ret; - } - return JblasNotSupport; - } -}; - -template -class DequantInt32ToFp32 { - public: - struct Param { - float* C; - int ldc; - int ldsa; - float* scalesA; - float* scalesB; - }; - JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - return kernel::wrapper::DequanS32Fp32::template forward(cacheptr, cachestep, cptr, _param.ldc, M, N, - _param.scalesA + M_offset * _param.ldsa, _param.ldsa, - _param.scalesB + N_offset); - } -}; - -template -class CompInt8BlockEpilogue { - public: - struct Param { - void* scalesB; - JBLAS_DTYPE scaleBdtype; - int ldsb; - float* scalesA; - int ldsa; - // optional if A asym - uint8_t* zpA = nullptr; - void* reduceB = nullptr; - JBLAS_DTYPE reduceBdtype = JBLAS_DTYPE::F32; - // optional if B asym - int8_t* zpB = nullptr; - float* reduceA = nullptr; - int K = 1; - }; - JBLAS_CODE forward(const int32_t* srcptr, float* dstptr, const int cachestep, const int M_offset, const int N_offset, - const int K_offset, const int M, const int N, const Param& _param, void* tmpcache, - size_t cachesize) { - JBLAS_CODE ret = JblasNotSupport; - float* scab = nullptr; - size_t ScaleBTmpSize = N * sizeof(float); - size_t ReduceBTmpSize = N * sizeof(float); - assert(cachesize >= (ScaleBTmpSize + ReduceBTmpSize)); - if (_param.scaleBdtype == JBLAS_DTYPE::BF16) { - auto scache = reinterpret_cast(tmpcache); - ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( - reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb, scache, 1, N, N, N, - false); - assert(ret == JblasSuccess); - scab = scache; - } else if (_param.scaleBdtype == JBLAS_DTYPE::F32) { - scab = reinterpret_cast(_param.scalesB) + N_offset + K_offset * _param.ldsb; - } - float* redb = nullptr; - if (_param.reduceB) { - if (_param.reduceBdtype == JBLAS_DTYPE::BF16) { - auto rcache = reinterpret_cast(reinterpret_cast(tmpcache) + ScaleBTmpSize); - ret = kernel::wrapper::Memcpy2DBf16CvtFp32::template forward( - reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb, rcache, 1, N, N, N, - false); - assert(ret == JblasSuccess); - redb = rcache; - } else if (_param.reduceBdtype == JBLAS_DTYPE::F32) { - redb = reinterpret_cast(_param.reduceB) + N_offset + K_offset * _param.ldsb; - } - } - ret = kernel::wrapper::DequanS32Fp32::template forward( - srcptr, cachestep, reinterpret_cast(const_cast(srcptr)), cachestep, M, N, - _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, scab); - assert(ret == JblasSuccess); - ret = kernel::wrapper::AccumulateFp32::template forward(reinterpret_cast(srcptr), cachestep, - dstptr, cachestep, M, N); - assert(ret == JblasSuccess); - - if (_param.zpA == nullptr) { - if (_param.zpB == nullptr) { - return ret; - } else { - 
ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( - dstptr, cachestep, M, N, _param.zpB + N_offset + K_offset * _param.ldsb, scab, _param.ldsa, - _param.reduceA + M_offset * _param.ldsa + K_offset); - } - } else { - if (_param.zpB == nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( - dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, - _param.scalesA + M_offset * _param.ldsa + K_offset, _param.ldsa, redb); - } else { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( - dstptr, cachestep, M, N, _param.zpA + M_offset * _param.ldsa + K_offset, - _param.zpB + N_offset + K_offset * _param.ldsb, _param.scalesA + M_offset * _param.ldsa + K_offset, scab, - _param.ldsa, _param.K, _param.reduceA + M_offset * _param.ldsa + K_offset, redb); - } - } - return ret; - } -}; - -template -class ZpDequantInt32ToFp32 { - public: - struct Param { - // necessary - float* C; - int ldc; - int ldsa; - float* scalesA; - float* scalesB; - // optional if A asym - uint8_t* zpA = nullptr; - float* reduceB = nullptr; - // optional if B asym - int8_t* zpB = nullptr; - float* reduceA = nullptr; - int K = 1; - }; - JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - auto ret = kernel::wrapper::DequanS32Fp32::template forward(cacheptr, cachestep, cptr, _param.ldc, M, N, - _param.scalesA + M_offset * _param.ldsa, - _param.ldsa, _param.scalesB + N_offset); - if (ret != JblasSuccess) { - return ret; - } - if (_param.zpA == nullptr && _param.zpB == nullptr) { - return ret; - } else if (_param.zpA != nullptr && _param.zpB == nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_act( - cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.scalesA + M_offset * _param.ldsa, - _param.ldsa, _param.reduceB + N_offset); - } else if (_param.zpA == nullptr && _param.zpB != nullptr) { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_wei( - cptr, _param.ldc, M, N, _param.zpB + N_offset, _param.scalesB + N_offset, _param.ldsa, - _param.reduceA + M_offset * _param.ldsa); - } else { - ret = kernel::wrapper::RemoveZeroPointBias::template forward_both( - cptr, _param.ldc, M, N, _param.zpA + M_offset * _param.ldsa, _param.zpB + N_offset, - _param.scalesA + M_offset * _param.ldsa, _param.scalesB + N_offset, _param.ldsa, _param.K, - _param.reduceA + M_offset * _param.ldsa, _param.reduceB + N_offset); - } - return ret; - } -}; - -template -class AlphaBetaProcessS32U8 { - public: - struct Param { - uint8_t* C; - int ldc; - float alpha; - float scaleAcc, scaleC; - int zpC; - }; - - JBLAS_CODE forward(const int32_t* cacheptr, const int cachestep, const int M_offset, const int N_offset, const int M, - const int N, const Param& _param, void* tmpcache, size_t cachesize) { - auto COffset = M_offset * _param.ldc + N_offset; - auto cptr = _param.C + COffset; - return kernel::wrapper::QuanOutS32U32::template forward(_param.alpha, cacheptr, cachestep, cptr, _param.ldc, - M, N, _param.scaleAcc, _param.scaleC, _param.zpC); - } -}; - -} // namespace gemm -} // namespace epilogue -} // namespace jblas diff --git a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h b/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h deleted file mode 100644 index 364da9223940f..0000000000000 --- 
a/onnxruntime/core/mlas/lib/x86_64/jblas/jblas/jit_blas_gemm.h +++ /dev/null @@ -1,2699 +0,0 @@ -// Copyright (c) 2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include - -#include "jit_blas_utils.h" -#include "jit_base.h" - -namespace jblas { -namespace gemm { -enum class CompType : uint32_t { - COMP_FP32 = 0, - COMP_BF16_FP32 = 1, - COMP_FP16_FP16 = 2, - COMP_INT_START = 3, - COMP_INT8_US_INT32 = COMP_INT_START, - COMP_INT8_UU_INT32 = 4, - COMP_INT8_SS_INT32 = 5, - COMP_INT8_SU_INT32 = 6, - COMP_INT16_SS_INT32 = 7, - COMP_INT8_US_FP32 = 8, - COMP_INT8_UU_FP32 = 9, - COMP_INT8_SS_FP32 = 10, - COMP_INT8_SU_FP32 = 11, -}; - -class CoreAttr { - public: - // INT32=LSB|**8bits:NTile**||**8bits:PackRow**||**8bits:CompType**||**8bits:Reserve**| - static uint32_t constexpr NTILE_MASK = 0xff, NTILE_SHIFT = 0, PACKROW_MASK = 0xff00, PACKROW_SHIFT = 8, - COMP_MASK = 0xff0000, COMP_SHIFT = 16, ISA_MASK = 0xff000000, ISA_SHIFT = 24; - - static inline uint32_t get_mask_val(uint32_t raw, uint32_t mask, uint32_t shift) { return (raw & mask) >> shift; } - static constexpr uint32_t make_core_id(uint32_t NTile, uint32_t PackRow, uint32_t CompType, uint32_t ISA) { - return (NTile << NTILE_SHIFT) | (PackRow << PACKROW_SHIFT) | (CompType << COMP_SHIFT) | (ISA << ISA_SHIFT); - } - - static void parse_id(uint32_t id, uint32_t* vals) { - vals[0] = get_mask_val(id, NTILE_MASK, NTILE_SHIFT); - vals[1] = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT); - vals[2] = get_mask_val(id, COMP_MASK, COMP_SHIFT); - vals[3] = get_mask_val(id, ISA_MASK, ISA_SHIFT); - } - - static const char* to_str(uint32_t id) { - static char tmp[128]; - uint32_t vals[4]; - parse_id(id, vals); - sprintf(tmp, "N%d_PACK%d_COMP%d_ISA%d", vals[0], vals[1], vals[2], vals[3]); - return tmp; - } - - static inline size_t get_bsize(uint32_t id) { - auto packrow = get_mask_val(id, PACKROW_MASK, PACKROW_SHIFT); - return size_t(4 / packrow); - } -}; - -namespace code { - -template -class Avx2N8P1 : protected jblas::xbyak::JitAvx2 { - public: - static int constexpr RegLen = 8, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX2; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32; - typedef float AType; - typedef float BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - 
add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512fN16P1 : protected jblas::xbyak::JitAvx512f { - public: - static int constexpr RegLen = 16, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512F; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32; - typedef float AType; - typedef float BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * 
BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512fp16N32P1 : protected jblas::xbyak::JitAvx512_fp16 { - public: - static int constexpr RegLen = 32, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_FP16; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP16_FP16; - typedef utils::fp16 AType; - typedef utils::fp16 BType; - typedef utils::fp16 CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - 
add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastw(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vpbroadcastw(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ph(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512bf16N16P2 : protected jblas::xbyak::JitAvx512_bf16 { - public: - static int constexpr RegLen = 16, PackRow = 2; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 2; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_BF16; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_BF16_FP32; - typedef utils::bf16 AType; - typedef utils::bf16 BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - 
add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vdpbf16ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512vnniN16P4 : protected jblas::xbyak::JitAvx512vnni { - public: - static int constexpr RegLen = 16, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_VNNI; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_INT32; - typedef uint8_t AType; - typedef int8_t BType; - typedef int32_t CType; - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - private: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - - protected: - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - 
cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _kunroll) { - for (int kk = 0; kk < _kunroll; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class AvxvnniN8P4 : protected jblas::xbyak::JitAvxvnni { - public: - static int constexpr RegLen = 8, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX_VNNI; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_INT32; - typedef uint8_t AType; - typedef int8_t BType; - typedef int32_t CType; - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - private: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - protected: - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - 
add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _kunroll) { - for (int kk = 0; kk < _kunroll; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vpbroadcastd(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Amxbf16N16P2 : protected jblas::xbyak::JitAmxbf16 { - public: - static int constexpr RegLen = 16, PackRow = 2; - static_assert(_NTILE % RegLen == 0); - static_assert(_MTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
1 : _MTILE / RegLen; - static_assert(NRegs * MRegs + 2 <= TileCount); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 32; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAMX_BF16; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_BF16_FP32; - typedef utils::bf16 AType; - typedef utils::bf16 BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - void* workspace; - }; - typedef long long (*func_t)(params*); - - int TmpRegCount = RegCount; - int TmpReg = 0; - int CTileCount = 0, ATileCount = 0, BTileCount = 0; - int CTile = 0, ATile = 0, BTile = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_tmp3; - Xbyak::Reg64 reg_ret = rax; - - void assign_regs() { - CTileCount = NRegs * MRegs; - auto tile_re = TileCount - CTileCount; - if (tile_re - 1 >= NRegs) { - BTileCount = NRegs; - ATileCount = tile_re - BTileCount; - } else if (tile_re - 1 >= MRegs) { - ATileCount = MRegs; - BTileCount = tile_re - ATileCount; - } else { - ATileCount = 1; - BTileCount = tile_re - ATileCount; - } - CTile = 0; - ATile = CTile + CTileCount; - BTile = ATile + ATileCount; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 11, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_tmp3 = st.t[10]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - 
generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int kunrll) { - auto& reg_Bstride = reg_tmp1; - mov(reg_Bstride, NTILE * 4); - int mtiles = _mtile / RegLen; - - for (int kk = 0; kk < kunrll; kk++) { - auto& reg_Atmp = reg_tmp2; - if (mtiles == 1) { - reg_Atmp = reg_matAptr; - } else { - mov(reg_Atmp, reg_matAptr); - } - if (BTileCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - } - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } else { - if (ATileCount == mtiles) { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - for (int mm = 0; mm < mtiles; mm++) { - tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); - } - } - } else { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - tdpbf16ps(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < CTileCount; i++) { - tilezero(Xbyak::Tmm(CTile + i)); - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - int mtnum = _mtile / 16; - for (int mm = 0; mm < mtnum; mm++) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), ptr[reg_matCptr + reg_cstride + i * 64]); - } - if (mm != mtnum - 1) { - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - } - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_tmp, dword[parambase + OFFSET(workspace)]); - mov(reg_tmp1, NTILE * 4); - for (int mm = 0; mm < MRegs; mm++) { - for (int i = 0; i < NRegs; i++) { - tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i)); - } - } - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - int zunroll = TmpRegCount / NRegs; - for (int i = 0; i < _mtile; i += zunroll) { - int 
m_re = utils::remainsize(i, _mtile, zunroll); - for (int im = 0; im < m_re; im++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]); - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - } - outLocalLabel(); - } -}; - -template -class Amxint8N16P4 : protected jblas::xbyak::JitAmxint8 { - public: - static int constexpr RegLen = 16, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static_assert(_MTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 1 : _MTILE / RegLen; - static_assert(NRegs * MRegs + 2 <= TileCount); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs * RegLen, KTILE = 64; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAMX_INT8; - static uint32_t constexpr COMPUTE = - (uint32_t)(std::is_same_v - ? std::is_same_v ? CompType::COMP_INT8_SS_INT32 : CompType::COMP_INT8_SU_INT32 - : std::is_same_v ? CompType::COMP_INT8_US_INT32 - : CompType::COMP_INT8_UU_INT32); - using AType = AT; - using BType = BT; - typedef int32_t CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - void* workspace; - }; - typedef long long (*func_t)(params*); - - int TmpRegCount = RegCount; - int TmpReg = 0; - int CTileCount = 0, ATileCount = 0, BTileCount = 0; - int CTile = 0, ATile = 0, BTile = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_tmp3; - Xbyak::Reg64 reg_ret = rax; - - void assign_regs() { - CTileCount = NRegs * MRegs; - auto tile_re = TileCount - CTileCount; - if (tile_re - 1 >= NRegs) { - BTileCount = NRegs; - ATileCount = tile_re - BTileCount; - } else if (tile_re - 1 >= MRegs) { - ATileCount = MRegs; - BTileCount = tile_re - ATileCount; - } else { - ATileCount = 1; - BTileCount = tile_re - ATileCount; - } - CTile = 0; - ATile = CTile + CTileCount; - BTile = ATile + ATileCount; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 11, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_tmp3 = st.t[10]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, 
ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int kunrll) { - auto& reg_Bstride = reg_tmp1; - mov(reg_Bstride, NTILE * 4); - int mtiles = _mtile / RegLen; - - for (int kk = 0; kk < kunrll; kk++) { - auto& reg_Atmp = reg_tmp2; - if (mtiles == 1) { - reg_Atmp = reg_matAptr; - } else { - mov(reg_Atmp, reg_matAptr); - } - if (BTileCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile + i), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - } - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile + i)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } else { - if (ATileCount == mtiles) { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile + mm), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - for (int mm = 0; mm < mtiles; mm++) { - _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile + mm), Xbyak::Tmm(BTile)); - } - } - } else { - for (int mm = 0; mm < mtiles; mm++) { - tileloadd(Xbyak::Tmm(ATile), ptr[reg_Atmp + reg_astride + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(BTile), ptr[reg_matBptr + reg_Bstride + kk * BKStepSize + i * 64]); - _tdpb(Xbyak::Tmm(CTile + mm * NRegs + i), Xbyak::Tmm(ATile), Xbyak::Tmm(BTile)); - } - if (mm != mtiles - 1) { - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - lea(reg_Atmp, ptr[reg_Atmp + 8 * reg_astride]); - } - } - } - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < CTileCount; i++) { - tilezero(Xbyak::Tmm(CTile + i)); - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - int mtnum = _mtile / 16; - for (int mm = 0; mm < mtnum; mm++) { - for (int i = 0; i < NRegs; i++) { - tileloaddt1(Xbyak::Tmm(CTile + mm * NRegs + i), 
ptr[reg_matCptr + reg_cstride + i * 64]); - } - if (mm != mtnum - 1) { - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - lea(reg_matCptr, ptr[reg_matCptr + 8 * reg_cstride]); - } - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_tmp, dword[parambase + OFFSET(workspace)]); - mov(reg_tmp1, NTILE * 4); - for (int mm = 0; mm < MRegs; mm++) { - for (int i = 0; i < NRegs; i++) { - tilestored(ptr[reg_tmp + reg_tmp1 + i * 64 + mm * 16 * NTILE * 4], Xbyak::Tmm(CTile + mm * NRegs + i)); - } - } - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - int zunroll = TmpRegCount / NRegs; - for (int i = 0; i < _mtile; i += zunroll) { - int m_re = utils::remainsize(i, _mtile, zunroll); - for (int im = 0; im < m_re; im++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(TmpReg + im * NRegs + j), ptr[reg_tmp + j * 64 + (i + im) * NTILE * 4]); - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(TmpReg + im * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - } - outLocalLabel(); - } -}; -template -using Amxint8N16P4US = Amxint8N16P4; - -template -using Amxint8N16P4SS = Amxint8N16P4; - -class AmxConfigure : protected jblas::xbyak::JitAmxtile { - public: - typedef long long (*func_t)(tileconfig_t*); - - static void configure(int TILE_M, int TILE_N, int TILE_K, int elesize, int ANum, int BNum, int CNum) { - static AmxConfigure code; - tileconfig_t cfg; - std::memset(&cfg, 0, sizeof(cfg)); - configure_tiles(cfg, TILE_M, TILE_N, TILE_K, elesize, ANum, BNum, CNum); - code.mKernel(&cfg); - } - - protected: - AmxConfigure() { - generate_config(this); - mKernel = getCode(); - } - - func_t mKernel = nullptr; -}; - -namespace kblock { -// optimize for kblock gemm, each block size in k dimension has dequant operation -// all accumulators use fp32 dtype. -template -class Avx512fN16P1 : protected jblas::xbyak::JitAvx512f { - public: - static int constexpr RegLen = 16, PackRow = 1; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1) / NRegs : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 1; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512F; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_FP32; - typedef float AType; - typedef float BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - int k; - int n; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_ret = rax; - Xbyak::Opmask msk_wr = k1; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = RegCount - ARegCount - CRegCount; - if (BRegCount < NRegs) { - BRegCount = 0; - ARegCount = BRegCount + 1; - } - if (BRegCount > NRegs) { - BRegCount = NRegs; - } - CReg = 0; - BReg = CReg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg <= RegCount); - TmpRegCount = RegCount - TmpReg; - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 10, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - mov(reg_tmp, reg_ksize); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kloop", T_NEAR); - L(".unkloop"); - generate_fma(_mtile, KUNROLL); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_iterk, KUNROLL * KTILE); - cmp(reg_iterk, reg_tmp); // k iteration variable - jb(".unkloop"); - cmp(reg_tmp, reg_ksize); - jge(".kend", T_NEAR); - L(".kloop"); - generate_fma(_mtile, 1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * 
BKStepSize); - add(reg_iterk, 1 * KTILE); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - L(".kend"); - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile) { - for (int kk = 0; kk < _ktile; kk++) { - lea(reg_tmp1, ptr[reg_matAptr + kk * AKStepSize]); - if (BRegCount == NRegs) { - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } else if (BRegCount == 0) { - for (int mm = 0; mm < _mtile; mm += ARegCount) { - int mm_re = utils::remainsize(mm, _mtile, ARegCount); - for (int imm = 0; imm < mm_re; imm++) { - vbroadcastss(vreg_t(AReg + imm), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vfmadd231ps(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg + imm), - ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - } - } - } else { - assert(0); - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j), vreg_t(CReg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CReg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CReg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -template -class Avx512vnniN16P4 : protected jblas::xbyak::JitAvx512vnni { - public: - static int constexpr RegLen = 16, PackRow = 4; - static_assert(_NTILE % RegLen == 0); - static int constexpr NRegs = _NTILE / RegLen; - static int constexpr MRegs = _MTILE == 0 ? 
(RegCount - 1 - NRegs) / (NRegs * 2) : _MTILE; - static_assert(NRegs * MRegs <= RegCount - 1); - static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4; - static int constexpr KUNROLL = 2; - static uint32_t constexpr ISA = (uint32_t)JBLAS_ISA::JblasAVX512_VNNI; - static uint32_t constexpr COMPUTE = (uint32_t)CompType::COMP_INT8_US_FP32; - typedef uint8_t AType; - typedef int8_t BType; - typedef float CType; - - struct params { - AType* matA; - int astride; - BType* matB; - int bstride; - CType* matC; - int cstride; - uint8_t* zpA; - float* scaleA; - int ldsa; - float* scaleB; - float* reduceB; - int ldsb; - int k; - int n; - int kblock; - int init; - }; - typedef long long (*func_t)(params*); - - int CRegCount = 0, BRegCount = 0, ARegCount = 0, TmpRegCount = 0; - int CReg = 0, CF32Reg = 0, BReg = 0, AReg = 0, TmpReg = 0; - static int constexpr BKStepSize = KTILE * NTILE * sizeof(BType); - static int constexpr AKStepSize = KTILE * sizeof(AType); - - void generate_code(int _mtile) { - assign_regs(); - reset(); - generate_mtile(_mtile); - ready(); - mKernel = getCode(); - } - func_t mKernel = nullptr; - - protected: - Xbyak::Reg64 parambase; - Xbyak::Reg64 reg_matAptr; - Xbyak::Reg64 reg_matBptr; - Xbyak::Reg64 reg_matCptr; - Xbyak::Reg64 reg_ksize; - Xbyak::Reg64 reg_nsize; - Xbyak::Reg64 reg_cstride; - Xbyak::Reg64 reg_astride; - Xbyak::Reg64 reg_iterk; - Xbyak::Reg64 reg_iterkb; - Xbyak::Reg64 reg_itern; - Xbyak::Reg64 reg_tmp; - Xbyak::Reg64 reg_tmp1; - Xbyak::Reg64 reg_tmp2; - Xbyak::Reg64 reg_tmp3; - Xbyak::Reg64 reg_tmp4; - Xbyak::Reg64 reg_ret = rax; - - void assign_regs() { - CRegCount = MRegs * NRegs; - ARegCount = 1; - BRegCount = NRegs; - CReg = 0; - CF32Reg = CReg + CRegCount; - BReg = CF32Reg + CRegCount; - AReg = BReg + BRegCount; - TmpReg = AReg + ARegCount; - assert(TmpReg < RegCount); - TmpRegCount = RegCount - TmpReg; - assert(TmpRegCount >= 1); - } - - void generate_mtile(int _mtile) { - inLocalLabel(); // use local label for multiple instance - Xbyak::util::StackFrame st(this, 1, 13, 16 * 10); - parambase = st.p[0]; - reg_matAptr = st.t[0]; - reg_matBptr = st.t[1]; - reg_matCptr = st.t[0]; - reg_ksize = st.t[2]; - reg_astride = st.t[3]; - reg_cstride = st.t[3]; - reg_iterk = st.t[4]; - reg_iterkb = st.t[12]; - reg_tmp = st.t[5]; - reg_tmp1 = st.t[6]; - reg_tmp2 = st.t[7]; - reg_tmp3 = st.t[10]; - reg_tmp4 = st.t[11]; - reg_nsize = st.t[8]; - reg_itern = st.t[9]; - reg_ret = rax; - - vreg_push(rsp); - - load32(reg_ksize, ptr[parambase + OFFSET(k)]); - load32(reg_nsize, ptr[parambase + OFFSET(n)]); - xor_(reg_itern, reg_itern); - L(".nloop"); - init_regs(_mtile); - mov(reg_matAptr, ptr[parambase + OFFSET(matA)]); - load32(reg_astride, ptr[parambase + OFFSET(astride)]); - mov(reg_matBptr, ptr[parambase + OFFSET(matB)]); - load32(reg_tmp, ptr[parambase + OFFSET(bstride)]); - imul(reg_tmp, reg_itern); - lea(reg_matBptr, ptr[reg_matBptr + reg_tmp]); - xor_(reg_iterk, reg_iterk); - generate_kloop(_mtile); - write_back(_mtile); - add(reg_itern, NTILE); - cmp(reg_itern, reg_nsize); - jb(".nloop"); - mov(reg_ret, 0); - vreg_pop(rsp); - - outLocalLabel(); // end of local label - } - - void generate_kloop(int _mtile) { - inLocalLabel(); - xor_(reg_iterkb, reg_iterkb); - L(".kloop"); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vpxorq(Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j), Xbyak::Zmm(CReg + i * NRegs + j)); - } - } - xor_(reg_tmp2, reg_tmp2); - load32(reg_tmp3, ptr[parambase + OFFSET(kblock)]); - mov(reg_tmp, 
reg_tmp3); - padto_le(reg_tmp, KUNROLL * KTILE); - cmp(reg_tmp, 0); - jz(".kbloop", T_NEAR); - L(".unkbloop"); - generate_fma(_mtile, KUNROLL, reg_tmp1); - add(reg_matAptr, KUNROLL * AKStepSize); - add(reg_matBptr, KUNROLL * BKStepSize); - add(reg_tmp2, KUNROLL * KTILE); - cmp(reg_tmp2, reg_tmp); - jb(".unkbloop"); - cmp(reg_tmp, reg_tmp3); - jge(".kend", T_NEAR); - L(".kbloop"); - generate_fma(_mtile, 1, reg_tmp1); - add(reg_matAptr, 1 * AKStepSize); - add(reg_matBptr, 1 * BKStepSize); - add(reg_tmp2, 1 * KTILE); - cmp(reg_tmp2, reg_tmp3); - jb(".kbloop"); - L(".kend"); - add(reg_iterk, reg_tmp2); - generate_f32_accumulate(_mtile); - generate_zp_correction(_mtile); - inc(reg_iterkb); - cmp(reg_iterk, reg_ksize); // k iteration variable - jb(".kloop"); - - outLocalLabel(); - } - - void generate_fma(int _mtile, int _ktile, Xbyak::Reg64& tmp) { - for (int kk = 0; kk < _ktile; kk++) { - lea(tmp, ptr[reg_matAptr + kk * AKStepSize]); - for (int i = 0; i < NRegs; i++) { - vmovups(vreg_t(BReg + i), ptr[reg_matBptr + kk * BKStepSize + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vpbroadcastd(vreg_t(AReg), ptr[reg_tmp1]); - add(reg_tmp1, reg_astride); - for (int i = 0; i < NRegs; i++) { - vpdpbusds_(vreg_t(CReg + mm * NRegs + i), vreg_t(AReg), vreg_t(BReg + i)); - } - } - } - } - - void init_regs(int _mtile) { - inLocalLabel(); - load32(reg_tmp, ptr[parambase + OFFSET(init)]); - cmp(reg_tmp, 0); - je(".read", T_NEAR); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vxor(vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j), vreg_t(CF32Reg + i * NRegs + j)); - } - } - jmp(".end", T_NEAR); - L(".read"); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(vreg_t(CF32Reg + i * NRegs + j), ptr[reg_matCptr + j * VecBytes]); - } - add(reg_matCptr, reg_cstride); - } - L(".end"); - outLocalLabel(); - } - - void generate_f32_accumulate(int _mtile) { - load32(reg_tmp, ptr[parambase + OFFSET(ldsb)]); - imul(reg_tmp, reg_iterkb); - mov(reg_tmp2, ptr[parambase + OFFSET(scaleB)]); - lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp * sizeof(float)]); - lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); - - mov(reg_tmp, ptr[parambase + OFFSET(scaleA)]); - lea(reg_tmp, ptr[reg_tmp + reg_iterkb * sizeof(float)]); - load32(reg_tmp1, ptr[parambase + OFFSET(ldsa)]); - for (int i = 0; i < NRegs; i++) { - vmovups(Xbyak::Zmm(BReg + i), ptr[reg_tmp2 + i * VecBytes]); - } - for (int mm = 0; mm < _mtile; mm++) { - vbroadcastss(Xbyak::Zmm(TmpReg), ptr[reg_tmp]); - lea(reg_tmp, ptr[reg_tmp + reg_tmp1 * sizeof(float)]); - for (int i = 0; i < NRegs; i++) { - vcvtdq2ps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i)); - vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(TmpReg), Xbyak::Zmm(BReg + i)); - vmulps(Xbyak::Zmm(CReg + mm * NRegs + i), Xbyak::Zmm(AReg)); - vaddps(Xbyak::Zmm(CF32Reg + mm * NRegs + i), Xbyak::Zmm(CReg + mm * NRegs + i)); - } - } - } - - void generate_zp_correction(int _mtile) { - load32(reg_tmp1, ptr[parambase + OFFSET(ldsb)]); - imul(reg_tmp1, reg_iterkb); - mov(reg_tmp2, ptr[parambase + OFFSET(reduceB)]); - lea(reg_tmp2, ptr[reg_tmp2 + reg_tmp1 * sizeof(float)]); - lea(reg_tmp2, ptr[reg_tmp2 + reg_itern * sizeof(float)]); - auto& reg_redB = reg_tmp2; - - mov(reg_tmp, ptr[parambase + OFFSET(zpA)]); - lea(reg_tmp, ptr[reg_tmp + reg_iterkb * 
sizeof(AType)]); - auto& reg_zpA = reg_tmp; - - mov(reg_tmp1, ptr[parambase + OFFSET(scaleA)]); - lea(reg_tmp1, ptr[reg_tmp1 + reg_iterkb * sizeof(float)]); - auto& reg_scaleA = reg_tmp1; - - load32(reg_tmp3, ptr[parambase + OFFSET(ldsa)]); - auto& reg_ldsa = reg_tmp3; - for (int i = 0; i < NRegs; i++) { - vmovups(Xbyak::Zmm(BReg + i), ptr[reg_redB + i * VecBytes]); - } - - for (int i = 0; i < _mtile; i++) { - vpbroadcastb(Xbyak::Xmm(AReg), ptr[reg_zpA]); - vpmovzxbd(Xbyak::Zmm(AReg), Xbyak::Xmm(AReg)); - vcvtdq2ps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg)); - vmulps(Xbyak::Zmm(AReg), Xbyak::Zmm(AReg), zword_b[reg_scaleA]); - for (int j = 0; j < NRegs; j++) { - vmulps(Xbyak::Zmm(CReg + j), Xbyak::Zmm(AReg), Xbyak::Zmm(BReg + j)); - vsubps(Xbyak::Zmm(CF32Reg + i * NRegs + j), Xbyak::Zmm(CReg + j)); - } - lea(reg_zpA, ptr[reg_zpA + reg_ldsa * sizeof(AType)]); - lea(reg_scaleA, ptr[reg_scaleA + reg_ldsa * sizeof(float)]); - } - } - - void write_back(int _mtile) { - inLocalLabel(); - mov(reg_matCptr, ptr[parambase + OFFSET(matC)]); - load32(reg_cstride, ptr[parambase + OFFSET(cstride)]); - lea(reg_matCptr, ptr[reg_matCptr + reg_itern * sizeof(CType)]); - for (int i = 0; i < _mtile; i++) { - for (int j = 0; j < NRegs; j++) { - vmovups(ptr[reg_matCptr + j * VecBytes], vreg_t(CF32Reg + i * NRegs + j)); - } - add(reg_matCptr, reg_cstride); - } - outLocalLabel(); - } -}; - -} // namespace kblock -} // namespace code -template